[AArch64] Add a phase-ordering test for a mla reduction sum. NFC

davemgreen · davemgreen · commit a97684303348 · 2025-08-10T15:40:12.000+01:00
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_muladd.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -O3 < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+; This function (a sum reduction of a[i] * b[i] over 16 elements) should be vectorized successfully.
+
+define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1) #0 { ; test input: fast-math sum of %0[i] * %1[i] for i in [0, 16), written as an unoptimized scalar loop
+; CHECK-LABEL: define dso_local nofpclass(nan inf) float @vmlaq
+; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <16 x float> [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP5]])
+; CHECK-NEXT:    ret float [[TMP6]]
+;
+  %3 = alloca ptr, align 8 ; stack slot for the first pointer argument (%0)
+  %4 = alloca ptr, align 8 ; stack slot for the second pointer argument (%1)
+  %5 = alloca float, align 4 ; running sum
+  %6 = alloca i32, align 4 ; loop counter
+  store ptr %0, ptr %3, align 8, !tbaa !4
+  store ptr %1, ptr %4, align 8, !tbaa !4
+  call void @llvm.lifetime.start.p0(ptr %5) #2
+  store float 0.000000e+00, ptr %5, align 4, !tbaa !9 ; sum = 0.0
+  call void @llvm.lifetime.start.p0(ptr %6) #2
+  store i32 0, ptr %6, align 4, !tbaa !11 ; counter = 0
+  br label %7
+
+7:                                                ; preds = %25, %2
+  %8 = load i32, ptr %6, align 4, !tbaa !11
+  %9 = icmp slt i32 %8, 16 ; loop condition: counter < 16 (signed)
+  br i1 %9, label %11, label %10
+
+10:                                               ; preds = %7
+  call void @llvm.lifetime.end.p0(ptr %6) #2 ; loop finished: the counter slot is dead from here on
+  br label %28
+
+11:                                               ; preds = %7
+  %12 = load ptr, ptr %3, align 8, !tbaa !4
+  %13 = load i32, ptr %6, align 4, !tbaa !11
+  %14 = sext i32 %13 to i64
+  %15 = getelementptr inbounds float, ptr %12, i64 %14 ; address of element [counter] behind the first pointer
+  %16 = load float, ptr %15, align 4, !tbaa !9
+  %17 = load ptr, ptr %4, align 8, !tbaa !4
+  %18 = load i32, ptr %6, align 4, !tbaa !11
+  %19 = sext i32 %18 to i64
+  %20 = getelementptr inbounds float, ptr %17, i64 %19 ; address of element [counter] behind the second pointer
+  %21 = load float, ptr %20, align 4, !tbaa !9
+  %22 = fmul fast float %16, %21 ; product of the two elements
+  %23 = load float, ptr %5, align 4, !tbaa !9
+  %24 = fadd fast float %23, %22 ; sum + product
+  store float %24, ptr %5, align 4, !tbaa !9 ; write the updated sum back to its slot
+  br label %25
+
+25:                                               ; preds = %11
+  %26 = load i32, ptr %6, align 4, !tbaa !11
+  %27 = add nsw i32 %26, 1 ; counter + 1
+  store i32 %27, ptr %6, align 4, !tbaa !11
+  br label %7, !llvm.loop !13 ; back edge to the loop header
+
+28:                                               ; preds = %10
+  %29 = load float, ptr %5, align 4, !tbaa !9 ; final sum
+  call void @llvm.lifetime.end.p0(ptr %5) #2
+  ret float %29
+}
+
+declare void @llvm.lifetime.start.p0(ptr captures(none)) #1
+declare void @llvm.lifetime.end.p0(ptr captures(none)) #1
+
+attributes #0 = { nounwind uwtable "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" } ; attribute group on the definition of @vmlaq
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } ; attribute group on the two lifetime intrinsic declarations
+attributes #2 = { nounwind } ; attribute group on the lifetime intrinsic call sites inside @vmlaq
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 1}
+!3 = !{!"clang version 22.0.0git"}
+!4 = !{!5, !5, i64 0} ; TBAA access tag for "p1 float"; attached to the loads/stores of the pointer slots %3 and %4
+!5 = !{!"p1 float", !6, i64 0}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"} ; root node of the TBAA type tree
+!9 = !{!10, !10, i64 0} ; TBAA access tag for "float"; attached to the element loads and the running-sum accesses
+!10 = !{!"float", !7, i64 0}
+!11 = !{!12, !12, i64 0} ; TBAA access tag for "int"; attached to the loop-counter accesses
+!12 = !{!"int", !7, i64 0}
+!13 = distinct !{!13, !14} ; loop ID attached to the back edge in @vmlaq
+!14 = !{!"llvm.loop.mustprogress"}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/reduce_submuladd.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -O3 < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+; This function (7 outer iterations, each summing (a[i] - b[i])^2 over 21 elements, then advancing a and b by runtime strides) should be vectorized successfully.
+
+define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) #0 { ; wrapper: forwards its four arguments unchanged to @_ZL6reduceILi7EEfPKfS1_ii and returns that result
+; CHECK-LABEL: define dso_local noundef nofpclass(nan inf) float @_Z4testPKfS0_ii
+; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]], i32 noundef [[TMP2:%.*]], i32 noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  .preheader.i:
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = load <20 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <20 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <20 x float> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <20 x float> [[TMP8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 80
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP14:%.*]] = fsub fast float [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul fast float [[TMP14]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP15]], <20 x float> [[TMP9]])
+; CHECK-NEXT:    [[TMP18:%.*]] = load <20 x float>, ptr [[TMP16]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load <20 x float>, ptr [[TMP17]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fsub fast <20 x float> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul fast <20 x float> [[TMP20]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP16]], i64 80
+; CHECK-NEXT:    [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 80
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP26:%.*]] = fsub fast float [[TMP23]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = fmul fast float [[TMP26]], [[TMP26]]
+; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_1:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP27]], <20 x float> [[TMP21]])
+; CHECK-NEXT:    [[OP_RDX3_1:%.*]] = fadd fast float [[OP_RDX_1]], [[OP_RDX]]
+; CHECK-NEXT:    [[TMP30:%.*]] = load <20 x float>, ptr [[TMP28]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load <20 x float>, ptr [[TMP29]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP32:%.*]] = fsub fast <20 x float> [[TMP30]], [[TMP31]]
+; CHECK-NEXT:    [[TMP33:%.*]] = fmul fast <20 x float> [[TMP32]], [[TMP32]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP28]], i64 80
+; CHECK-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP29]], i64 80
+; CHECK-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP38:%.*]] = fsub fast float [[TMP35]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = fmul fast float [[TMP38]], [[TMP38]]
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_2:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP39]], <20 x float> [[TMP33]])
+; CHECK-NEXT:    [[OP_RDX3_2:%.*]] = fadd fast float [[OP_RDX_2]], [[OP_RDX3_1]]
+; CHECK-NEXT:    [[TMP42:%.*]] = load <20 x float>, ptr [[TMP40]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP43:%.*]] = load <20 x float>, ptr [[TMP41]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP44:%.*]] = fsub fast <20 x float> [[TMP42]], [[TMP43]]
+; CHECK-NEXT:    [[TMP45:%.*]] = fmul fast <20 x float> [[TMP44]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP40]], i64 80
+; CHECK-NEXT:    [[TMP47:%.*]] = load float, ptr [[TMP46]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP41]], i64 80
+; CHECK-NEXT:    [[TMP49:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP50:%.*]] = fsub fast float [[TMP47]], [[TMP49]]
+; CHECK-NEXT:    [[TMP51:%.*]] = fmul fast float [[TMP50]], [[TMP50]]
+; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds float, ptr [[TMP40]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP41]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_3:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP51]], <20 x float> [[TMP45]])
+; CHECK-NEXT:    [[OP_RDX3_3:%.*]] = fadd fast float [[OP_RDX_3]], [[OP_RDX3_2]]
+; CHECK-NEXT:    [[TMP54:%.*]] = load <20 x float>, ptr [[TMP52]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP55:%.*]] = load <20 x float>, ptr [[TMP53]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP56:%.*]] = fsub fast <20 x float> [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    [[TMP57:%.*]] = fmul fast <20 x float> [[TMP56]], [[TMP56]]
+; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP52]], i64 80
+; CHECK-NEXT:    [[TMP59:%.*]] = load float, ptr [[TMP58]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP53]], i64 80
+; CHECK-NEXT:    [[TMP61:%.*]] = load float, ptr [[TMP60]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP62:%.*]] = fsub fast float [[TMP59]], [[TMP61]]
+; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP62]], [[TMP62]]
+; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds float, ptr [[TMP52]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP53]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_4:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP63]], <20 x float> [[TMP57]])
+; CHECK-NEXT:    [[OP_RDX3_4:%.*]] = fadd fast float [[OP_RDX_4]], [[OP_RDX3_3]]
+; CHECK-NEXT:    [[TMP66:%.*]] = load <20 x float>, ptr [[TMP64]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP67:%.*]] = load <20 x float>, ptr [[TMP65]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP68:%.*]] = fsub fast <20 x float> [[TMP66]], [[TMP67]]
+; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast <20 x float> [[TMP68]], [[TMP68]]
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP64]], i64 80
+; CHECK-NEXT:    [[TMP71:%.*]] = load float, ptr [[TMP70]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP65]], i64 80
+; CHECK-NEXT:    [[TMP73:%.*]] = load float, ptr [[TMP72]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP74:%.*]] = fsub fast float [[TMP71]], [[TMP73]]
+; CHECK-NEXT:    [[TMP75:%.*]] = fmul fast float [[TMP74]], [[TMP74]]
+; CHECK-NEXT:    [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP64]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP77:%.*]] = getelementptr inbounds float, ptr [[TMP65]], i64 [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX_5:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP75]], <20 x float> [[TMP69]])
+; CHECK-NEXT:    [[OP_RDX3_5:%.*]] = fadd fast float [[OP_RDX_5]], [[OP_RDX3_4]]
+; CHECK-NEXT:    [[TMP78:%.*]] = load <20 x float>, ptr [[TMP76]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP79:%.*]] = load <20 x float>, ptr [[TMP77]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP80:%.*]] = fsub fast <20 x float> [[TMP78]], [[TMP79]]
+; CHECK-NEXT:    [[TMP81:%.*]] = fmul fast <20 x float> [[TMP80]], [[TMP80]]
+; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP76]], i64 80
+; CHECK-NEXT:    [[TMP83:%.*]] = load float, ptr [[TMP82]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP84:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP77]], i64 80
+; CHECK-NEXT:    [[TMP85:%.*]] = load float, ptr [[TMP84]], align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP86:%.*]] = fsub fast float [[TMP83]], [[TMP85]]
+; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], [[TMP86]]
+; CHECK-NEXT:    [[OP_RDX_6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v20f32(float [[TMP87]], <20 x float> [[TMP81]])
+; CHECK-NEXT:    [[OP_RDX3_6:%.*]] = fadd fast float [[OP_RDX_6]], [[OP_RDX3_5]]
+; CHECK-NEXT:    ret float [[OP_RDX3_6]]
+;
+  %5 = alloca ptr, align 8 ; stack slot for pointer argument %0
+  %6 = alloca ptr, align 8 ; stack slot for pointer argument %1
+  %7 = alloca i32, align 4 ; stack slot for integer argument %2
+  %8 = alloca i32, align 4 ; stack slot for integer argument %3
+  store ptr %0, ptr %5, align 8, !tbaa !4
+  store ptr %1, ptr %6, align 8, !tbaa !4
+  store i32 %2, ptr %7, align 4, !tbaa !9
+  store i32 %3, ptr %8, align 4, !tbaa !9
+  %9 = load ptr, ptr %5, align 8, !tbaa !4 ; reload each argument from its slot before forwarding it
+  %10 = load ptr, ptr %6, align 8, !tbaa !4
+  %11 = load i32, ptr %7, align 4, !tbaa !9
+  %12 = load i32, ptr %8, align 4, !tbaa !9
+  %13 = call fast noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr noundef %9, ptr noundef %10, i32 noundef %11, i32 noundef %12) ; the expected output above contains no call to this callee, i.e. it is expected to be inlined
+  ret float %13
+}
+
+define internal noundef nofpclass(nan inf) float @_ZL6reduceILi7EEfPKfS1_ii(ptr noundef %0, ptr noundef %1, i32 noundef %2, i32 noundef %3) #1 { ; 7 outer iterations; each sums (%0[j] - %1[j])^2 for j in [0, 21), then advances %0 by %2 and %1 by %3 floats; returns the total
+  %5 = alloca ptr, align 8 ; current first pointer (starts as %0, advanced once per outer iteration)
+  %6 = alloca ptr, align 8 ; current second pointer (starts as %1, advanced once per outer iteration)
+  %7 = alloca i32, align 4 ; stride for the first pointer (%2)
+  %8 = alloca i32, align 4 ; stride for the second pointer (%3)
+  %9 = alloca i32, align 4 ; written once with 3 below and never read in this function
+  %10 = alloca i32, align 4 ; written once with 3 below and never read in this function
+  %11 = alloca i32, align 4 ; written once with 7 below and never read in this function
+  %12 = alloca float, align 4 ; total across all outer iterations
+  %13 = alloca i32, align 4 ; outer loop counter
+  %14 = alloca i32, align 4 ; only ever written (2, 5, 1) and never read here -- presumably clang's cleanup-destination slot; confirm
+  %15 = alloca float, align 4 ; per-outer-iteration partial sum
+  %16 = alloca i32, align 4 ; inner loop counter
+  %17 = alloca float, align 4 ; difference of the current element pair
+  store ptr %0, ptr %5, align 8, !tbaa !4
+  store ptr %1, ptr %6, align 8, !tbaa !4
+  store i32 %2, ptr %7, align 4, !tbaa !9
+  store i32 %3, ptr %8, align 4, !tbaa !9
+  call void @llvm.lifetime.start.p0(ptr %9) #3
+  store i32 3, ptr %9, align 4, !tbaa !9
+  call void @llvm.lifetime.start.p0(ptr %10) #3
+  store i32 3, ptr %10, align 4, !tbaa !9
+  call void @llvm.lifetime.start.p0(ptr %11) #3
+  store i32 7, ptr %11, align 4, !tbaa !9
+  call void @llvm.lifetime.start.p0(ptr %12) #3
+  store float 0.000000e+00, ptr %12, align 4, !tbaa !11 ; total = 0.0
+  call void @llvm.lifetime.start.p0(ptr %13) #3
+  store i32 0, ptr %13, align 4, !tbaa !9 ; outer counter = 0
+  br label %18
+
+18:                                               ; preds = %59, %4
+  %19 = load i32, ptr %13, align 4, !tbaa !9
+  %20 = icmp slt i32 %19, 7 ; outer loop condition: outer counter < 7 (signed)
+  br i1 %20, label %22, label %21
+
+21:                                               ; preds = %18
+  store i32 2, ptr %14, align 4
+  call void @llvm.lifetime.end.p0(ptr %13) #3
+  br label %62
+
+22:                                               ; preds = %18
+  call void @llvm.lifetime.start.p0(ptr %15) #3
+  store float 0.000000e+00, ptr %15, align 4, !tbaa !11 ; partial sum = 0.0 at the start of every outer iteration
+  call void @llvm.lifetime.start.p0(ptr %16) #3
+  store i32 0, ptr %16, align 4, !tbaa !9 ; inner counter = 0
+  br label %23
+
+23:                                               ; preds = %44, %22
+  %24 = load i32, ptr %16, align 4, !tbaa !9
+  %25 = icmp slt i32 %24, 21 ; inner loop condition: inner counter < 21 (signed)
+  br i1 %25, label %27, label %26
+
+26:                                               ; preds = %23
+  store i32 5, ptr %14, align 4
+  call void @llvm.lifetime.end.p0(ptr %16) #3
+  br label %47
+
+27:                                               ; preds = %23
+  call void @llvm.lifetime.start.p0(ptr %17) #3
+  %28 = load ptr, ptr %5, align 8, !tbaa !4
+  %29 = load i32, ptr %16, align 4, !tbaa !9
+  %30 = sext i32 %29 to i64
+  %31 = getelementptr inbounds float, ptr %28, i64 %30 ; address of element [inner counter] behind the current first pointer
+  %32 = load float, ptr %31, align 4, !tbaa !11
+  %33 = load ptr, ptr %6, align 8, !tbaa !4
+  %34 = load i32, ptr %16, align 4, !tbaa !9
+  %35 = sext i32 %34 to i64
+  %36 = getelementptr inbounds float, ptr %33, i64 %35 ; address of element [inner counter] behind the current second pointer
+  %37 = load float, ptr %36, align 4, !tbaa !11
+  %38 = fsub fast float %32, %37 ; difference of the two elements
+  store float %38, ptr %17, align 4, !tbaa !11
+  %39 = load float, ptr %17, align 4, !tbaa !11
+  %40 = load float, ptr %17, align 4, !tbaa !11
+  %41 = fmul fast float %39, %40 ; difference squared (both operands are loaded from the same slot)
+  %42 = load float, ptr %15, align 4, !tbaa !11
+  %43 = fadd fast float %42, %41 ; partial sum + squared difference
+  store float %43, ptr %15, align 4, !tbaa !11
+  call void @llvm.lifetime.end.p0(ptr %17) #3
+  br label %44
+
+44:                                               ; preds = %27
+  %45 = load i32, ptr %16, align 4, !tbaa !9
+  %46 = add nsw i32 %45, 1 ; inner counter + 1
+  store i32 %46, ptr %16, align 4, !tbaa !9
+  br label %23, !llvm.loop !13 ; inner-loop back edge
+
+47:                                               ; preds = %26
+  %48 = load i32, ptr %7, align 4, !tbaa !9
+  %49 = load ptr, ptr %5, align 8, !tbaa !4
+  %50 = sext i32 %48 to i64
+  %51 = getelementptr inbounds float, ptr %49, i64 %50 ; first pointer advanced by its stride, in float elements
+  store ptr %51, ptr %5, align 8, !tbaa !4
+  %52 = load i32, ptr %8, align 4, !tbaa !9
+  %53 = load ptr, ptr %6, align 8, !tbaa !4
+  %54 = sext i32 %52 to i64
+  %55 = getelementptr inbounds float, ptr %53, i64 %54 ; second pointer advanced by its stride, in float elements
+  store ptr %55, ptr %6, align 8, !tbaa !4
+  %56 = load float, ptr %15, align 4, !tbaa !11
+  %57 = load float, ptr %12, align 4, !tbaa !11
+  %58 = fadd fast float %57, %56 ; total + this iteration's partial sum
+  store float %58, ptr %12, align 4, !tbaa !11
+  call void @llvm.lifetime.end.p0(ptr %15) #3
+  br label %59
+
+59:                                               ; preds = %47
+  %60 = load i32, ptr %13, align 4, !tbaa !9
+  %61 = add nsw i32 %60, 1 ; outer counter + 1
+  store i32 %61, ptr %13, align 4, !tbaa !9
+  br label %18, !llvm.loop !15 ; outer-loop back edge
+
+62:                                               ; preds = %21
+  %63 = load float, ptr %12, align 4, !tbaa !11 ; final total, read before the slot's lifetime ends
+  store i32 1, ptr %14, align 4
+  call void @llvm.lifetime.end.p0(ptr %12) #3
+  call void @llvm.lifetime.end.p0(ptr %11) #3
+  call void @llvm.lifetime.end.p0(ptr %10) #3
+  call void @llvm.lifetime.end.p0(ptr %9) #3
+  ret float %63
+}
+
+declare void @llvm.lifetime.start.p0(ptr captures(none)) #2
+declare void @llvm.lifetime.end.p0(ptr captures(none)) #2
+
+attributes #0 = { mustprogress uwtable "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" } ; attribute group on the definition of @_Z4testPKfS0_ii
+attributes #1 = { inlinehint mustprogress nounwind uwtable "approx-func-fp-math"="true" "frame-pointer"="non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a,-fmv" "unsafe-fp-math"="true" } ; attribute group on @_ZL6reduceILi7EEfPKfS1_ii; same as #0 plus inlinehint and nounwind
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } ; attribute group on the two lifetime intrinsic declarations
+attributes #3 = { nounwind } ; attribute group on the lifetime intrinsic call sites
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 1}
+!3 = !{!"clang version 22.0.0git"}
+!4 = !{!5, !5, i64 0} ; TBAA access tag for "p1 float"; attached to the loads/stores of the pointer slots
+!5 = !{!"p1 float", !6, i64 0}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C++ TBAA"} ; root node of the TBAA type tree
+!9 = !{!10, !10, i64 0} ; TBAA access tag for "int"; attached to the stride, constant and loop-counter accesses
+!10 = !{!"int", !7, i64 0}
+!11 = !{!12, !12, i64 0} ; TBAA access tag for "float"; attached to the element, difference and sum accesses
+!12 = !{!"float", !7, i64 0}
+!13 = distinct !{!13, !14} ; loop ID attached to the inner-loop back edge
+!14 = !{!"llvm.loop.mustprogress"}
+!15 = distinct !{!15, !14} ; loop ID attached to the outer-loop back edge