@@ -359,12 +359,177 @@ define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x
359359 ret float %r
360360}
361361
362+ ; SVE has no FMULV instruction, so use knowledge of the architectural maximum
363+ ; size of an SVE register to "scalarise" the reduction.
364+
; fmulv_nxv2f16: fast-math fmul reduction of <vscale x 2 x half> %a, seeded
; with the scalar %init.
; Expected lowering: z2 is a splat of 1.0 (the fmul identity). Each uzp2/uzp1 +
; fmul round multiplies the odd-indexed elements by the even-indexed ones,
; leaving the partial products in the low half of z1 and 1.0 in the rest.
; There are five rounds, enough for 32 elements (2 x a maximum vscale of 16,
; i.e. 2048-bit registers); on shorter vectors the surplus rounds multiply by
; 1.0 only. The final scalar fmul folds %init (h0) into lane 0 (h1).
; The vector fmul is governed by ptrue p0.d and the shuffles use .d elements,
; i.e. one f16 value per 64-bit container.
; NOTE(review): the CHECK lines look like update_llc_test_checks.py output;
; regenerate rather than hand-edit -- confirm against the file header.
365+ define half @fmulv_nxv2f16 (half %init , <vscale x 2 x half > %a ) {
366+ ; CHECK-LABEL: fmulv_nxv2f16:
367+ ; CHECK: // %bb.0:
368+ ; CHECK-NEXT: fmov z2.h, #1.00000000
369+ ; CHECK-NEXT: ptrue p0.d
370+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
371+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
372+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
373+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
374+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
375+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
376+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
377+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
378+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
379+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
380+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
381+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
382+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
383+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
384+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
385+ ; CHECK-NEXT: fmul h0, h0, h1
386+ ; CHECK-NEXT: ret
387+ %res = call fast half @llvm.vector.reduce.fmul.nxv2f16 (half %init , <vscale x 2 x half > %a )
388+ ret half %res
389+ }
390+
; fmulv_nxv4f16: fast-math fmul reduction of <vscale x 4 x half> %a, seeded
; with the scalar %init.
; Expected lowering: z2 is a splat of 1.0 (the fmul identity). Each uzp2/uzp1 +
; fmul round multiplies the odd-indexed elements by the even-indexed ones,
; leaving the partial products in the low half of z1 and 1.0 in the rest.
; There are six rounds, enough for 64 elements (4 x a maximum vscale of 16,
; i.e. 2048-bit registers); on shorter vectors the surplus rounds multiply by
; 1.0 only. The final scalar fmul folds %init (h0) into lane 0 (h1).
; The vector fmul is governed by ptrue p0.s and the shuffles use .s elements,
; i.e. one f16 value per 32-bit container.
; NOTE(review): the CHECK lines look like update_llc_test_checks.py output;
; regenerate rather than hand-edit -- confirm against the file header.
391+ define half @fmulv_nxv4f16 (half %init , <vscale x 4 x half > %a ) {
392+ ; CHECK-LABEL: fmulv_nxv4f16:
393+ ; CHECK: // %bb.0:
394+ ; CHECK-NEXT: fmov z2.h, #1.00000000
395+ ; CHECK-NEXT: ptrue p0.s
396+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
397+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
398+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
399+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
400+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
401+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
402+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
403+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
404+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
405+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
406+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
407+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
408+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
409+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
410+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
411+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
412+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
413+ ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h
414+ ; CHECK-NEXT: fmul h0, h0, h1
415+ ; CHECK-NEXT: ret
416+ %res = call fast half @llvm.vector.reduce.fmul.nxv4f16 (half %init , <vscale x 4 x half > %a )
417+ ret half %res
418+ }
419+
; fmulv_nxv8f16: fast-math fmul reduction of <vscale x 8 x half> %a, seeded
; with the scalar %init.
; Expected lowering: z2 is a splat of 1.0 (the fmul identity). Each uzp2/uzp1 +
; fmul round multiplies the odd-indexed elements by the even-indexed ones,
; leaving the partial products in the low half of z1 and 1.0 in the rest.
; There are seven rounds, enough for 128 elements (8 x a maximum vscale of 16,
; i.e. 2048-bit registers); on shorter vectors the surplus rounds multiply by
; 1.0 only. The final scalar fmul folds %init (h0) into lane 0 (h1).
; The shuffles and the fmul all use .h elements, and the fmul is the
; unpredicated form (no ptrue is expected).
; NOTE(review): the CHECK lines look like update_llc_test_checks.py output;
; regenerate rather than hand-edit -- confirm against the file header.
420+ define half @fmulv_nxv8f16 (half %init , <vscale x 8 x half > %a ) {
421+ ; CHECK-LABEL: fmulv_nxv8f16:
422+ ; CHECK: // %bb.0:
423+ ; CHECK-NEXT: fmov z2.h, #1.00000000
424+ ; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
425+ ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
426+ ; CHECK-NEXT: fmul z1.h, z1.h, z3.h
427+ ; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
428+ ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
429+ ; CHECK-NEXT: fmul z1.h, z1.h, z3.h
430+ ; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
431+ ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
432+ ; CHECK-NEXT: fmul z1.h, z1.h, z3.h
433+ ; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
434+ ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
435+ ; CHECK-NEXT: fmul z1.h, z1.h, z3.h
436+ ; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
437+ ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
438+ ; CHECK-NEXT: fmul z1.h, z1.h, z3.h
439+ ; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
440+ ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
441+ ; CHECK-NEXT: fmul z1.h, z1.h, z3.h
442+ ; CHECK-NEXT: uzp2 z3.h, z1.h, z2.h
443+ ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
444+ ; CHECK-NEXT: fmul z1.h, z1.h, z3.h
445+ ; CHECK-NEXT: fmul h0, h0, h1
446+ ; CHECK-NEXT: ret
447+ %res = call fast half @llvm.vector.reduce.fmul.nxv8f16 (half %init , <vscale x 8 x half > %a )
448+ ret half %res
449+ }
450+
; fmulv_nxv2f32: fast-math fmul reduction of <vscale x 2 x float> %a, seeded
; with the scalar %init.
; Expected lowering: z2 is a splat of 1.0 (the fmul identity). Each uzp2/uzp1 +
; fmul round multiplies the odd-indexed elements by the even-indexed ones,
; leaving the partial products in the low half of z1 and 1.0 in the rest.
; There are five rounds, enough for 32 elements (2 x a maximum vscale of 16,
; i.e. 2048-bit registers); on shorter vectors the surplus rounds multiply by
; 1.0 only. The final scalar fmul folds %init (s0) into lane 0 (s1).
; The vector fmul is governed by ptrue p0.d and the shuffles use .d elements,
; i.e. one f32 value per 64-bit container.
; NOTE(review): the CHECK lines look like update_llc_test_checks.py output;
; regenerate rather than hand-edit -- confirm against the file header.
451+ define float @fmulv_nxv2f32 (float %init , <vscale x 2 x float > %a ) {
452+ ; CHECK-LABEL: fmulv_nxv2f32:
453+ ; CHECK: // %bb.0:
454+ ; CHECK-NEXT: fmov z2.s, #1.00000000
455+ ; CHECK-NEXT: ptrue p0.d
456+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
457+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
458+ ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
459+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
460+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
461+ ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
462+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
463+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
464+ ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
465+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
466+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
467+ ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
468+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
469+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
470+ ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s
471+ ; CHECK-NEXT: fmul s0, s0, s1
472+ ; CHECK-NEXT: ret
473+ %res = call fast float @llvm.vector.reduce.fmul.nxv2f32 (float %init , <vscale x 2 x float > %a )
474+ ret float %res
475+ }
476+
; fmulv_nxv4f32: fast-math fmul reduction of <vscale x 4 x float> %a, seeded
; with the scalar %init.
; Expected lowering: z2 is a splat of 1.0 (the fmul identity). Each uzp2/uzp1 +
; fmul round multiplies the odd-indexed elements by the even-indexed ones,
; leaving the partial products in the low half of z1 and 1.0 in the rest.
; There are six rounds, enough for 64 elements (4 x a maximum vscale of 16,
; i.e. 2048-bit registers); on shorter vectors the surplus rounds multiply by
; 1.0 only. The final scalar fmul folds %init (s0) into lane 0 (s1).
; The shuffles and the fmul all use .s elements, and the fmul is the
; unpredicated form (no ptrue is expected).
; NOTE(review): the CHECK lines look like update_llc_test_checks.py output;
; regenerate rather than hand-edit -- confirm against the file header.
477+ define float @fmulv_nxv4f32 (float %init , <vscale x 4 x float > %a ) {
478+ ; CHECK-LABEL: fmulv_nxv4f32:
479+ ; CHECK: // %bb.0:
480+ ; CHECK-NEXT: fmov z2.s, #1.00000000
481+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
482+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
483+ ; CHECK-NEXT: fmul z1.s, z1.s, z3.s
484+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
485+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
486+ ; CHECK-NEXT: fmul z1.s, z1.s, z3.s
487+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
488+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
489+ ; CHECK-NEXT: fmul z1.s, z1.s, z3.s
490+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
491+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
492+ ; CHECK-NEXT: fmul z1.s, z1.s, z3.s
493+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
494+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
495+ ; CHECK-NEXT: fmul z1.s, z1.s, z3.s
496+ ; CHECK-NEXT: uzp2 z3.s, z1.s, z2.s
497+ ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s
498+ ; CHECK-NEXT: fmul z1.s, z1.s, z3.s
499+ ; CHECK-NEXT: fmul s0, s0, s1
500+ ; CHECK-NEXT: ret
501+ %res = call fast float @llvm.vector.reduce.fmul.nxv4f32 (float %init , <vscale x 4 x float > %a )
502+ ret float %res
503+ }
504+
; fmulv_nxv2f64: fast-math fmul reduction of <vscale x 2 x double> %a, seeded
; with the scalar %init.
; Expected lowering: z2 is a splat of 1.0 (the fmul identity). Each uzp2/uzp1 +
; fmul round multiplies the odd-indexed elements by the even-indexed ones,
; leaving the partial products in the low half of z1 and 1.0 in the rest.
; There are five rounds, enough for 32 elements (2 x a maximum vscale of 16,
; i.e. 2048-bit registers); on shorter vectors the surplus rounds multiply by
; 1.0 only. The final scalar fmul folds %init (d0) into lane 0 (d1).
; The shuffles and the fmul all use .d elements, and the fmul is the
; unpredicated form (no ptrue is expected).
; NOTE(review): the CHECK lines look like update_llc_test_checks.py output;
; regenerate rather than hand-edit -- confirm against the file header.
505+ define double @fmulv_nxv2f64 (double %init , <vscale x 2 x double > %a ) {
506+ ; CHECK-LABEL: fmulv_nxv2f64:
507+ ; CHECK: // %bb.0:
508+ ; CHECK-NEXT: fmov z2.d, #1.00000000
509+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
510+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
511+ ; CHECK-NEXT: fmul z1.d, z1.d, z3.d
512+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
513+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
514+ ; CHECK-NEXT: fmul z1.d, z1.d, z3.d
515+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
516+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
517+ ; CHECK-NEXT: fmul z1.d, z1.d, z3.d
518+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
519+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
520+ ; CHECK-NEXT: fmul z1.d, z1.d, z3.d
521+ ; CHECK-NEXT: uzp2 z3.d, z1.d, z2.d
522+ ; CHECK-NEXT: uzp1 z1.d, z1.d, z2.d
523+ ; CHECK-NEXT: fmul z1.d, z1.d, z3.d
524+ ; CHECK-NEXT: fmul d0, d0, d1
525+ ; CHECK-NEXT: ret
526+ %res = call fast double @llvm.vector.reduce.fmul.nxv2f64 (double %init , <vscale x 2 x double > %a )
527+ ret double %res
528+ }
529+
362530declare half @llvm.vector.reduce.fadd.nxv2f16 (half , <vscale x 2 x half >)
363531declare half @llvm.vector.reduce.fadd.nxv4f16 (half , <vscale x 4 x half >)
364532declare half @llvm.vector.reduce.fadd.nxv8f16 (half , <vscale x 8 x half >)
365- declare half @llvm.vector.reduce.fadd.nxv6f16 (half , <vscale x 6 x half >)
366- declare half @llvm.vector.reduce.fadd.nxv10f16 (half , <vscale x 10 x half >)
367- declare half @llvm.vector.reduce.fadd.nxv12f16 (half , <vscale x 12 x half >)
368533declare float @llvm.vector.reduce.fadd.nxv2f32 (float , <vscale x 2 x float >)
369534declare float @llvm.vector.reduce.fadd.nxv4f32 (float , <vscale x 4 x float >)
370535declare float @llvm.vector.reduce.fadd.nxv8f32 (float , <vscale x 8 x float >)
@@ -397,3 +562,10 @@ declare half @llvm.vector.reduce.fminimum.nxv8f16(<vscale x 8 x half>)
397562declare float @llvm.vector.reduce.fminimum.nxv2f32 (<vscale x 2 x float >)
398563declare float @llvm.vector.reduce.fminimum.nxv4f32 (<vscale x 4 x float >)
399564declare double @llvm.vector.reduce.fminimum.nxv2f64 (<vscale x 2 x double >)
565+
; Declarations of the llvm.vector.reduce.fmul intrinsics called by the fmulv_*
; tests: one per scalable vector type, each taking (scalar start value, vector).
566+ declare half @llvm.vector.reduce.fmul.nxv2f16 (half , <vscale x 2 x half >)
567+ declare half @llvm.vector.reduce.fmul.nxv4f16 (half , <vscale x 4 x half >)
568+ declare half @llvm.vector.reduce.fmul.nxv8f16 (half , <vscale x 8 x half >)
569+ declare float @llvm.vector.reduce.fmul.nxv2f32 (float , <vscale x 2 x float >)
570+ declare float @llvm.vector.reduce.fmul.nxv4f32 (float , <vscale x 4 x float >)
571+ declare double @llvm.vector.reduce.fmul.nxv2f64 (double , <vscale x 2 x double >)
0 commit comments