|
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -o - %s | FileCheck %s
 
 define <4 x i32> @smull(<4 x i16> %x, ptr %y) {
 ; CHECK-LABEL: smull:
|
   ret <4 x i32> %r
 }
 
+; We shouldn't sink the splat operand for scalable vectors.
+define <vscale x 4 x float> @fmul_scalable(ptr %x, ptr %y) {
+; CHECK-LABEL: fmul_scalable:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    sxtw x8, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    mov z1.s, s1
+; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:  .LBB13_1: // %l1
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    subs w9, w9, #1
+; CHECK-NEXT:    add x1, x1, x8
+; CHECK-NEXT:    fmul z2.s, z2.s, z1.s
+; CHECK-NEXT:    fadd z0.s, z2.s, z0.s
+; CHECK-NEXT:    b.eq .LBB13_1
+; CHECK-NEXT:  // %bb.2: // %l2
+; CHECK-NEXT:    ret
+entry:
+  %x.val = load float, ptr %x
+  %x.ins = insertelement <vscale x 4 x float> poison, float %x.val, i64 0
+  %a = shufflevector <vscale x 4 x float> %x.ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  %vscale = tail call i32 @llvm.vscale.i32()
+  %stride = shl nuw nsw i32 %vscale, 4
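+  ; The splat %a is defined once in the entry block; its only use is the fmul
+  ; inside %l1, the kind of use that operand sinking would otherwise move into
+  ; the loop.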
+  br label %l1
+
+l1:
+  %p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
+  %q = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
+  %idx.y = mul nuw nsw i32 %p, %stride
+  %ptr.y = getelementptr float, ptr %y, i32 %idx.y
+  %l = load <vscale x 4 x float>, ptr %ptr.y
+  %b = fmul <vscale x 4 x float> %l, %a
+  %c = fadd <vscale x 4 x float> %b, %q
+  %pa = add i32 %p, 1
+  %c1 = icmp eq i32 %p, 0
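+  ; %p is 0 only on the first iteration, so the backedge is taken once and the
+  ; loop body executes exactly twice before branching to %l2.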
+  br i1 %c1, label %l1, label %l2
+
+l2:
+  ret <vscale x 4 x float> %c
+}
+
 
 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
|