@@ -230,29 +230,34 @@ l2:
230230 ret <4 x i32 > %c
231231}
232232
233- define <4 x float > @fmul (< 4 x float > %x , ptr %y ) {
233+ define <4 x float > @fmul (ptr %x , ptr %y ) {
234234; CHECK-LABEL: fmul:
235235; CHECK: // %bb.0: // %entry
236- ; CHECK-NEXT: mov v1.16b, v0.16b
237- ; CHECK-NEXT: ldr q2, [x0]
238236; CHECK-NEXT: movi v0.2d, #0000000000000000
239- ; CHECK-NEXT: mov w8, #1 // =0x1
240- ; CHECK-NEXT: fmul v1.4s, v2.4s, v1.s[3]
237+ ; CHECK-NEXT: ldr s1, [x0]
238+ ; CHECK-NEXT: mov x8, xzr
241239; CHECK-NEXT: .LBB7_1: // %l1
242240; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
243- ; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s
244- ; CHECK-NEXT: subs w8, w8, #1
241+ ; CHECK-NEXT: ldr q2, [x1, x8]
242+ ; CHECK-NEXT: add x8, x8, #16
243+ ; CHECK-NEXT: cmp w8, #16
244+ ; CHECK-NEXT: fmul v2.4s, v2.4s, v1.s[0]
245+ ; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
245246; CHECK-NEXT: b.eq .LBB7_1
246247; CHECK-NEXT: // %bb.2: // %l2
247248; CHECK-NEXT: ret
248249entry:
249- %a = shufflevector <4 x float > %x , <4 x float > undef , <4 x i32 > <i32 3 , i32 3 , i32 3 , i32 3 >
250+ %x.val = load float , ptr %x
251+ %x.ins = insertelement <4 x float > poison, float %x.val , i64 0
252+ %a = shufflevector <4 x float > %x.ins , <4 x float > undef , <4 x i32 > zeroinitializer
250253 br label %l1
251254
252255l1:
253256 %p = phi i32 [ 0 , %entry ], [ %pa , %l1 ]
254257 %q = phi <4 x float > [ zeroinitializer , %entry ], [ %c , %l1 ]
255- %l = load <4 x float >, ptr %y
258+ %idx.y = mul nuw nsw i32 %p , 4
259+ %ptr.y = getelementptr float , ptr %y , i32 %idx.y
260+ %l = load <4 x float >, ptr %ptr.y
256261 %b = fmul <4 x float > %l , %a
257262 %c = fadd <4 x float > %b , %q
258263 %pa = add i32 %p , 1
@@ -270,10 +275,9 @@ define <4 x float> @fmuladd(<4 x float> %x, ptr %y) {
270275; CHECK-NEXT: movi v0.2d, #0000000000000000
271276; CHECK-NEXT: ldr q2, [x0]
272277; CHECK-NEXT: mov w8, #1 // =0x1
273- ; CHECK-NEXT: dup v1.4s, v1.s[3]
274278; CHECK-NEXT: .LBB8_1: // %l1
275279; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
276- ; CHECK-NEXT: fmla v0.4s, v1 .4s, v2.4s
280+ ; CHECK-NEXT: fmla v0.4s, v2 .4s, v1.s[3]
277281; CHECK-NEXT: subs w8, w8, #1
278282; CHECK-NEXT: b.eq .LBB8_1
279283; CHECK-NEXT: // %bb.2: // %l2
@@ -418,6 +422,134 @@ l2:
418422 ret <4 x i32 > %r
419423}
420424
425+ ; We shouldn't sink the splat into the loop without fullfp16: the half fmul is done in f32 via fcvtl/fcvtn, and the expected code keeps the ld1r splat and its fcvtl ahead of the loop.
426+ define <4 x half > @fmul_half (ptr %x , ptr %y ) {
427+ ; CHECK-LABEL: fmul_half:
428+ ; CHECK: // %bb.0: // %entry
429+ ; CHECK-NEXT: ld1r { v1.4h }, [x0]
430+ ; CHECK-NEXT: movi d0, #0000000000000000
431+ ; CHECK-NEXT: mov x8, xzr
432+ ; CHECK-NEXT: fcvtl v1.4s, v1.4h
433+ ; CHECK-NEXT: .LBB13_1: // %l1
434+ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
435+ ; CHECK-NEXT: ldr d2, [x1, x8]
436+ ; CHECK-NEXT: fcvtl v0.4s, v0.4h
437+ ; CHECK-NEXT: add x8, x8, #8
438+ ; CHECK-NEXT: cmp w8, #8
439+ ; CHECK-NEXT: fcvtl v2.4s, v2.4h
440+ ; CHECK-NEXT: fmul v2.4s, v2.4s, v1.4s
441+ ; CHECK-NEXT: fcvtn v2.4h, v2.4s
442+ ; CHECK-NEXT: fcvtl v2.4s, v2.4h
443+ ; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
444+ ; CHECK-NEXT: fcvtn v0.4h, v0.4s
445+ ; CHECK-NEXT: b.eq .LBB13_1
446+ ; CHECK-NEXT: // %bb.2: // %l2
447+ ; CHECK-NEXT: ret
448+ entry:
449+ %x.val = load half , ptr %x ; scalar multiplier read from %x
450+ %x.ins = insertelement <4 x half > poison, half %x.val , i64 0 ; put the scalar in lane 0
451+ %a = shufflevector <4 x half > %x.ins , <4 x half > undef , <4 x i32 > zeroinitializer ; all-zero mask broadcasts lane 0: the loop-invariant splat under test
452+ br label %l1
453+
454+ l1:
455+ %p = phi i32 [ 0 , %entry ], [ %pa , %l1 ] ; iteration counter
456+ %q = phi <4 x half > [ zeroinitializer , %entry ], [ %c , %l1 ] ; running sum, starts at zero
457+ %idx.y = mul nuw nsw i32 %p , 4 ; element offset into %y: four halves per iteration
458+ %ptr.y = getelementptr half , ptr %y , i32 %idx.y
459+ %l = load <4 x half >, ptr %ptr.y
460+ %b = fmul <4 x half > %l , %a ; the only user of the splat
461+ %c = fadd <4 x half > %b , %q
462+ %pa = add i32 %p , 1
463+ %c1 = icmp eq i32 %p , 0 ; true only for %p == 0, so the body executes for %p = 0 and %p = 1
464+ br i1 %c1 , label %l1 , label %l2
465+
466+ l2:
467+ ret <4 x half > %c
468+ }
469+
470+ define <4 x half > @fmul_half_fullfp16 (ptr %x , ptr %y ) "target-features" ="+fullfp16" { ; same IR as @fmul_half but with +fullfp16: the expected code loads the scalar with ldr h1 and uses it by element (v1.h[0]) in the loop's fmul
471+ ; CHECK-LABEL: fmul_half_fullfp16:
472+ ; CHECK: // %bb.0: // %entry
473+ ; CHECK-NEXT: movi d0, #0000000000000000
474+ ; CHECK-NEXT: ldr h1, [x0]
475+ ; CHECK-NEXT: mov x8, xzr
476+ ; CHECK-NEXT: .LBB14_1: // %l1
477+ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
478+ ; CHECK-NEXT: ldr d2, [x1, x8]
479+ ; CHECK-NEXT: add x8, x8, #8
480+ ; CHECK-NEXT: cmp w8, #8
481+ ; CHECK-NEXT: fmul v2.4h, v2.4h, v1.h[0]
482+ ; CHECK-NEXT: fadd v0.4h, v2.4h, v0.4h
483+ ; CHECK-NEXT: b.eq .LBB14_1
484+ ; CHECK-NEXT: // %bb.2: // %l2
485+ ; CHECK-NEXT: ret
486+ entry:
487+ %x.val = load half , ptr %x ; scalar multiplier read from %x
488+ %x.ins = insertelement <4 x half > poison, half %x.val , i64 0 ; put the scalar in lane 0
489+ %a = shufflevector <4 x half > %x.ins , <4 x half > undef , <4 x i32 > zeroinitializer ; all-zero mask broadcasts lane 0: the loop-invariant splat under test
490+ br label %l1
491+
492+ l1:
493+ %p = phi i32 [ 0 , %entry ], [ %pa , %l1 ] ; iteration counter
494+ %q = phi <4 x half > [ zeroinitializer , %entry ], [ %c , %l1 ] ; running sum, starts at zero
495+ %idx.y = mul nuw nsw i32 %p , 4 ; element offset into %y: four halves per iteration
496+ %ptr.y = getelementptr half , ptr %y , i32 %idx.y
497+ %l = load <4 x half >, ptr %ptr.y
498+ %b = fmul <4 x half > %l , %a ; the only user of the splat
499+ %c = fadd <4 x half > %b , %q
500+ %pa = add i32 %p , 1
501+ %c1 = icmp eq i32 %p , 0 ; true only for %p == 0, so the body executes for %p = 0 and %p = 1
502+ br i1 %c1 , label %l1 , label %l2
503+
504+ l2:
505+ ret <4 x half > %c
506+ }
507+
508+ ; We shouldn't sink the splat operand for scalable vectors: the expected code keeps the ld1rw broadcast ahead of the loop and multiplies by the full z1.s vector.
509+ define <vscale x 4 x float > @fmul_scalable (ptr %x , ptr %y ) "target-features" ="+sve" {
510+ ; CHECK-LABEL: fmul_scalable:
511+ ; CHECK: // %bb.0: // %entry
512+ ; CHECK-NEXT: ptrue p0.s
513+ ; CHECK-NEXT: rdvl x8, #1
514+ ; CHECK-NEXT: mov z0.s, #0 // =0x0
515+ ; CHECK-NEXT: sxtw x8, w8
516+ ; CHECK-NEXT: mov w9, #1 // =0x1
517+ ; CHECK-NEXT: ld1rw { z1.s }, p0/z, [x0]
518+ ; CHECK-NEXT: lsl x8, x8, #2
519+ ; CHECK-NEXT: .LBB15_1: // %l1
520+ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
521+ ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1]
522+ ; CHECK-NEXT: subs w9, w9, #1
523+ ; CHECK-NEXT: add x1, x1, x8
524+ ; CHECK-NEXT: fmul z2.s, z2.s, z1.s
525+ ; CHECK-NEXT: fadd z0.s, z2.s, z0.s
526+ ; CHECK-NEXT: b.eq .LBB15_1
527+ ; CHECK-NEXT: // %bb.2: // %l2
528+ ; CHECK-NEXT: ret
529+ entry:
530+ %x.val = load float , ptr %x ; scalar multiplier read from %x
531+ %x.ins = insertelement <vscale x 4 x float > poison, float %x.val , i64 0 ; put the scalar in lane 0
532+ %a = shufflevector <vscale x 4 x float > %x.ins , <vscale x 4 x float > undef , <vscale x 4 x i32 > zeroinitializer ; all-zero mask broadcasts lane 0: the loop-invariant splat under test
533+ %vscale = tail call i32 @llvm.vscale.i32 () ; named instead of a stray numbered temporary so value numbering stays valid
534+ %stride = shl nuw nsw i32 %vscale , 4 ; vscale * 16, used below as a float-element step - NOTE(review): one <vscale x 4 x float> holds vscale * 4 floats; the expected assembly depends on this step, confirm it is intended
535+ br label %l1
536+
537+ l1:
538+ %p = phi i32 [ 0 , %entry ], [ %pa , %l1 ] ; iteration counter
539+ %q = phi <vscale x 4 x float > [ zeroinitializer , %entry ], [ %c , %l1 ] ; running sum, starts at zero
540+ %idx.y = mul nuw nsw i32 %p , %stride ; element offset into %y for this iteration
541+ %ptr.y = getelementptr float , ptr %y , i32 %idx.y
542+ %l = load <vscale x 4 x float >, ptr %ptr.y
543+ %b = fmul <vscale x 4 x float > %l , %a ; the only user of the splat
544+ %c = fadd <vscale x 4 x float > %b , %q
545+ %pa = add i32 %p , 1
546+ %c1 = icmp eq i32 %p , 0 ; true only for %p == 0, so the body executes for %p = 0 and %p = 1
547+ br i1 %c1 , label %l1 , label %l2
548+
549+ l2:
550+ ret <vscale x 4 x float > %c
551+ }
552+
421553
422554declare <4 x i32 > @llvm.aarch64.neon.smull.v4i32 (<4 x i16 >, <4 x i16 >)
423555declare <4 x i32 > @llvm.aarch64.neon.umull.v4i32 (<4 x i16 >, <4 x i16 >)
0 commit comments