diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll index 0d4c053551011..5547567092903 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll @@ -1,29 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "aba" --filter "abd" --filter "add" --version 5 ; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=aarch64-unknown-unknown -mcpu=neoverse-v2 -o - | FileCheck %s define i64 @sabalb_i32_to_i64_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: sabalb_i32_to_i64_accumulation +; CHECK-LABEL: sabalb_i32_to_i64_accumulation: +; CHECK: sabdlb z1.d, z1.s, z2.s +; CHECK: sabalb z0.d, z3.s, z4.s +; CHECK: sabalb z1.d, z4.s, z2.s +; CHECK: sabdlb z2.d, z3.s, z2.s +; CHECK: sabalb z2.d, z4.s, z3.s +; CHECK: sabalb z0.d, z3.s, z4.s +; CHECK: sabalb z1.d, z3.s, z4.s +; CHECK: sabalb z2.d, z3.s, z4.s +; CHECK: sabalb z0.d, z3.s, z4.s +; CHECK: sabalb z1.d, z3.s, z4.s +; CHECK: sabalb z2.d, z3.s, z4.s +; CHECK: sabalb z0.d, z3.s, z4.s +; CHECK: sabalb z1.d, z3.s, z4.s +; CHECK: sabalb z2.d, z3.s, z4.s +; CHECK: sabalb z0.d, z3.s, z4.s +; CHECK: add z0.d, z2.d, z0.d +; CHECK: sabalb z1.d, z3.s, z4.s +; CHECK: add z0.d, z0.d, z1.d +; CHECK: uaddv d0, p0, z0.d entry: br label %loop loop: -; CHECK: sabdlb -; CHECK: sabalb z0.d -; CHECK: sabalb z1.d -; CHECK: sabalb z2.d -; CHECK: add z0.d, z2.d, z0.d -; CHECK: add z0.d, z0.d, z1.d -; CHECK: uaddv d0, p0, z0.d %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.sabalb.nxv2i64( %acc_phi, %a, %b) - - %next_i = add i32 %i, 4 - %cmp = icmp slt i32 %next_i, 64 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i64 @llvm.vector.reduce.add.nxv2i64( %acc_next) @@ -34,29 +47,41 @@ declare @llvm.aarch64.sve.sabalb.nxv2i64(, declare i64 @llvm.vector.reduce.add.nxv2i64() define i32 @sabalb_i16_to_i32_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: sabalb_i16_to_i32_accumulation +; CHECK-LABEL: sabalb_i16_to_i32_accumulation: +; CHECK: sabdlb z1.s, z1.h, z2.h +; CHECK: sabalb z0.s, z3.h, z4.h +; CHECK: sabalb z1.s, z4.h, z2.h +; CHECK: sabdlb z2.s, z3.h, z2.h +; CHECK: sabalb z2.s, z4.h, z3.h +; CHECK: sabalb z0.s, z3.h, z4.h +; CHECK: sabalb z1.s, z3.h, z4.h +; CHECK: sabalb z2.s, z3.h, z4.h +; CHECK: sabalb z0.s, z3.h, z4.h +; CHECK: sabalb z1.s, z3.h, z4.h +; CHECK: sabalb z2.s, z3.h, z4.h +; CHECK: sabalb z0.s, z3.h, z4.h +; CHECK: sabalb z1.s, z3.h, z4.h +; CHECK: sabalb z2.s, z3.h, z4.h +; CHECK: sabalb z0.s, z3.h, z4.h +; CHECK: add z0.s, z2.s, z0.s +; CHECK: sabalb z1.s, z3.h, z4.h +; CHECK: add z0.s, z0.s, z1.s +; CHECK: uaddv d0, p0, z0.s entry: br label %loop loop: -; CHECK: sabdlb -; CHECK: sabalb z0.s -; CHECK: sabalb z1.s -; CHECK: sabalb z2.s -; CHECK: add z0.s, z2.s, z0.s -; CHECK: add z0.s, z0.s, z1.s -; CHECK: uaddv d0, p0, z0.s %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.sabalb.nxv4i32( %acc_phi, %a, %b) - - %next_i = add i32 %i, 8 - %cmp = icmp slt i32 %next_i, 128 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i32 @llvm.vector.reduce.add.nxv4i32( %acc_next) @@ -67,29 +92,41 @@ declare @llvm.aarch64.sve.sabalb.nxv4i32(, declare i32 @llvm.vector.reduce.add.nxv4i32() define i16 @sabalb_i8_to_i16_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: sabalb_i8_to_i16_accumulation +; CHECK-LABEL: sabalb_i8_to_i16_accumulation: +; CHECK: sabdlb z1.h, z1.b, z2.b +; CHECK: sabalb z0.h, z3.b, z4.b +; CHECK: sabalb z1.h, z4.b, z2.b +; CHECK: sabdlb z2.h, z3.b, z2.b +; CHECK: sabalb z2.h, z4.b, z3.b +; CHECK: sabalb z0.h, z3.b, z4.b +; CHECK: sabalb z1.h, z3.b, z4.b +; CHECK: sabalb z2.h, z3.b, z4.b +; CHECK: sabalb z0.h, z3.b, z4.b +; CHECK: sabalb z1.h, z3.b, z4.b +; CHECK: sabalb z2.h, z3.b, z4.b +; CHECK: sabalb z0.h, z3.b, z4.b +; CHECK: sabalb z1.h, z3.b, z4.b +; CHECK: sabalb z2.h, z3.b, z4.b +; CHECK: sabalb z0.h, z3.b, z4.b +; CHECK: add z0.h, z2.h, z0.h +; CHECK: sabalb z1.h, z3.b, z4.b +; CHECK: add z0.h, z0.h, z1.h +; CHECK: uaddv d0, p0, z0.h entry: br label %loop loop: -; CHECK: sabdlb -; CHECK: sabalb z0.h -; CHECK: sabalb z1.h -; CHECK: sabalb z2.h -; CHECK: add z0.h, z2.h, z0.h -; CHECK: add z0.h, z0.h, z1.h -; CHECK: uaddv d0, p0, z0.h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.sabalb.nxv8i16( %acc_phi, %a, %b) - - %next_i = add i32 %i, 16 - %cmp = icmp slt i32 %next_i, 256 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i16 @llvm.vector.reduce.add.nxv8i16( %acc_next) @@ -100,29 +137,41 @@ declare @llvm.aarch64.sve.sabalb.nxv8i16(, declare i16 @llvm.vector.reduce.add.nxv8i16() define i64 @sabalt_i32_to_i64_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: sabalt_i32_to_i64_accumulation +; CHECK-LABEL: sabalt_i32_to_i64_accumulation: +; CHECK: sabdlt z1.d, z1.s, z2.s +; CHECK: sabalt z0.d, z3.s, z4.s +; CHECK: sabalt z1.d, z4.s, z2.s +; CHECK: sabdlt z2.d, z3.s, z2.s +; CHECK: sabalt z2.d, z4.s, z3.s +; CHECK: sabalt z0.d, z3.s, z4.s +; CHECK: sabalt z1.d, z3.s, z4.s +; CHECK: sabalt z2.d, z3.s, z4.s +; CHECK: sabalt z0.d, z3.s, z4.s +; CHECK: sabalt z1.d, z3.s, z4.s +; CHECK: sabalt z2.d, z3.s, z4.s +; CHECK: sabalt z0.d, z3.s, z4.s +; CHECK: sabalt z1.d, z3.s, z4.s +; CHECK: sabalt z2.d, z3.s, z4.s +; CHECK: sabalt z0.d, z3.s, z4.s +; CHECK: add z0.d, z2.d, z0.d +; CHECK: sabalt z1.d, z3.s, z4.s +; CHECK: add z0.d, z0.d, z1.d +; CHECK: uaddv d0, p0, z0.d entry: br label %loop loop: -; CHECK: sabdlt -; CHECK: sabalt z0.d -; CHECK: sabalt z1.d -; CHECK: sabalt z2.d -; CHECK: add z0.d, z2.d, z0.d -; CHECK: add z0.d, z0.d, z1.d -; CHECK: uaddv d0, p0, z0.d %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.sabalt.nxv2i64( %acc_phi, %a, %b) - - %next_i = add i32 %i, 4 - %cmp = icmp slt i32 %next_i, 64 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i64 @llvm.vector.reduce.add.nxv2i64( %acc_next) @@ -132,29 +181,41 @@ exit: declare @llvm.aarch64.sve.sabalt.nxv2i64(, , ) define i32 @sabalt_i16_to_i32_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: sabalt_i16_to_i32_accumulation +; CHECK-LABEL: sabalt_i16_to_i32_accumulation: +; CHECK: sabdlt z1.s, z1.h, z2.h +; CHECK: sabalt z0.s, z3.h, z4.h +; CHECK: sabalt z1.s, z4.h, z2.h +; CHECK: sabdlt z2.s, z3.h, z2.h +; CHECK: sabalt z2.s, z4.h, z3.h +; CHECK: sabalt z0.s, z3.h, z4.h +; CHECK: sabalt z1.s, z3.h, z4.h +; CHECK: sabalt z2.s, z3.h, z4.h +; CHECK: sabalt z0.s, z3.h, z4.h +; CHECK: sabalt z1.s, z3.h, z4.h +; CHECK: sabalt z2.s, z3.h, z4.h +; CHECK: sabalt z0.s, z3.h, z4.h +; CHECK: sabalt z1.s, z3.h, z4.h +; CHECK: sabalt z2.s, z3.h, z4.h +; CHECK: sabalt z0.s, z3.h, z4.h +; CHECK: add z0.s, z2.s, z0.s +; CHECK: sabalt z1.s, z3.h, z4.h +; CHECK: add z0.s, z0.s, z1.s +; CHECK: uaddv d0, p0, z0.s entry: br label %loop loop: -; CHECK: sabdlt -; CHECK: sabalt z0.s -; CHECK: sabalt z1.s -; CHECK: sabalt z2.s -; CHECK: add z0.s, z2.s, z0.s -; CHECK: add z0.s, z0.s, z1.s -; CHECK: uaddv d0, p0, z0.s %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.sabalt.nxv4i32( %acc_phi, %a, %b) - - %next_i = add i32 %i, 8 - %cmp = icmp slt i32 %next_i, 128 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i32 @llvm.vector.reduce.add.nxv4i32( %acc_next) @@ -164,29 +225,41 @@ exit: declare @llvm.aarch64.sve.sabalt.nxv4i32(, , ) define i16 @sabalt_i8_to_i16_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: sabalt_i8_to_i16_accumulation +; CHECK-LABEL: sabalt_i8_to_i16_accumulation: +; CHECK: sabdlt z1.h, z1.b, z2.b +; CHECK: sabalt z0.h, z3.b, z4.b +; CHECK: sabalt z1.h, z4.b, z2.b +; CHECK: sabdlt z2.h, z3.b, z2.b +; CHECK: sabalt z2.h, z4.b, z3.b +; CHECK: sabalt z0.h, z3.b, z4.b +; CHECK: sabalt z1.h, z3.b, z4.b +; CHECK: sabalt z2.h, z3.b, z4.b +; CHECK: sabalt z0.h, z3.b, z4.b +; CHECK: sabalt z1.h, z3.b, z4.b +; CHECK: sabalt z2.h, z3.b, z4.b +; CHECK: sabalt z0.h, z3.b, z4.b +; CHECK: sabalt z1.h, z3.b, z4.b +; CHECK: sabalt z2.h, z3.b, z4.b +; CHECK: sabalt z0.h, z3.b, z4.b +; CHECK: add z0.h, z2.h, z0.h +; CHECK: sabalt z1.h, z3.b, z4.b +; CHECK: add z0.h, z0.h, z1.h +; CHECK: uaddv d0, p0, z0.h entry: br label %loop loop: -; CHECK: sabdlt -; CHECK: sabalt z0.h -; CHECK: sabalt z1.h -; CHECK: sabalt z2.h -; CHECK: add z0.h, z2.h, z0.h -; CHECK: add z0.h, z0.h, z1.h -; CHECK: uaddv d0, p0, z0.h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.sabalt.nxv8i16( %acc_phi, %a, %b) - - %next_i = add i32 %i, 16 - %cmp = icmp slt i32 %next_i, 256 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i16 @llvm.vector.reduce.add.nxv8i16( %acc_next) @@ -196,29 +269,41 @@ exit: declare @llvm.aarch64.sve.sabalt.nxv8i16(, , ) define i64 @uabalb_i32_to_i64_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: uabalb_i32_to_i64_accumulation +; CHECK-LABEL: uabalb_i32_to_i64_accumulation: +; CHECK: uabdlb z1.d, z1.s, z2.s +; CHECK: uabalb z0.d, z3.s, z4.s +; CHECK: uabalb z1.d, z4.s, z2.s +; CHECK: uabdlb z2.d, z3.s, z2.s +; CHECK: uabalb z2.d, z4.s, z3.s +; CHECK: uabalb z0.d, z3.s, z4.s +; CHECK: uabalb z1.d, z3.s, z4.s +; CHECK: uabalb z2.d, z3.s, z4.s +; CHECK: uabalb z0.d, z3.s, z4.s +; CHECK: uabalb z1.d, z3.s, z4.s +; CHECK: uabalb z2.d, z3.s, z4.s +; CHECK: uabalb z0.d, z3.s, z4.s +; CHECK: uabalb z1.d, z3.s, z4.s +; CHECK: uabalb z2.d, z3.s, z4.s +; CHECK: uabalb z0.d, z3.s, z4.s +; CHECK: add z0.d, z2.d, z0.d +; CHECK: uabalb z1.d, z3.s, z4.s +; CHECK: add z0.d, z0.d, z1.d +; CHECK: uaddv d0, p0, z0.d entry: br label %loop loop: -; CHECK: uabdlb -; CHECK: uabalb z0.d -; CHECK: uabalb z1.d -; CHECK: uabalb z2.d -; CHECK: add z0.d, z2.d, z0.d -; CHECK: add z0.d, z0.d, z1.d -; CHECK: uaddv d0, p0, z0.d %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.uabalb.nxv2i64( %acc_phi, %a, %b) - - %next_i = add i32 %i, 4 - %cmp = icmp slt i32 %next_i, 64 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i64 @llvm.vector.reduce.add.nxv2i64( %acc_next) @@ -228,29 +313,41 @@ exit: declare @llvm.aarch64.sve.uabalb.nxv2i64(, , ) define i32 @uabalb_i16_to_i32_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: uabalb_i16_to_i32_accumulation +; CHECK-LABEL: uabalb_i16_to_i32_accumulation: +; CHECK: uabdlb z1.s, z1.h, z2.h +; CHECK: uabalb z0.s, z3.h, z4.h +; CHECK: uabalb z1.s, z4.h, z2.h +; CHECK: uabdlb z2.s, z3.h, z2.h +; CHECK: uabalb z2.s, z4.h, z3.h +; CHECK: uabalb z0.s, z3.h, z4.h +; CHECK: uabalb z1.s, z3.h, z4.h +; CHECK: uabalb z2.s, z3.h, z4.h +; CHECK: uabalb z0.s, z3.h, z4.h +; CHECK: uabalb z1.s, z3.h, z4.h +; CHECK: uabalb z2.s, z3.h, z4.h +; CHECK: uabalb z0.s, z3.h, z4.h +; CHECK: uabalb z1.s, z3.h, z4.h +; CHECK: uabalb z2.s, z3.h, z4.h +; CHECK: uabalb z0.s, z3.h, z4.h +; CHECK: add z0.s, z2.s, z0.s +; CHECK: uabalb z1.s, z3.h, z4.h +; CHECK: add z0.s, z0.s, z1.s +; CHECK: uaddv d0, p0, z0.s entry: br label %loop loop: -; CHECK: uabdlb -; CHECK: uabalb z0.s -; CHECK: uabalb z1.s -; CHECK: uabalb z2.s -; CHECK: add z0.s, z2.s, z0.s -; CHECK: add z0.s, z0.s, z1.s -; CHECK: uaddv d0, p0, z0.s %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.uabalb.nxv4i32( %acc_phi, %a, %b) - - %next_i = add i32 %i, 8 - %cmp = icmp slt i32 %next_i, 128 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i32 @llvm.vector.reduce.add.nxv4i32( %acc_next) @@ -260,29 +357,41 @@ exit: declare @llvm.aarch64.sve.uabalb.nxv4i32(, , ) define i16 @uabalb_i8_to_i16_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: uabalb_i8_to_i16_accumulation +; CHECK-LABEL: uabalb_i8_to_i16_accumulation: +; CHECK: uabdlb z1.h, z1.b, z2.b +; CHECK: uabalb z0.h, z3.b, z4.b +; CHECK: uabalb z1.h, z4.b, z2.b +; CHECK: uabdlb z2.h, z3.b, z2.b +; CHECK: uabalb z2.h, z4.b, z3.b +; CHECK: uabalb z0.h, z3.b, z4.b +; CHECK: uabalb z1.h, z3.b, z4.b +; CHECK: uabalb z2.h, z3.b, z4.b +; CHECK: uabalb z0.h, z3.b, z4.b +; CHECK: uabalb z1.h, z3.b, z4.b +; CHECK: uabalb z2.h, z3.b, z4.b +; CHECK: uabalb z0.h, z3.b, z4.b +; CHECK: uabalb z1.h, z3.b, z4.b +; CHECK: uabalb z2.h, z3.b, z4.b +; CHECK: uabalb z0.h, z3.b, z4.b +; CHECK: add z0.h, z2.h, z0.h +; CHECK: uabalb z1.h, z3.b, z4.b +; CHECK: add z0.h, z0.h, z1.h +; CHECK: uaddv d0, p0, z0.h entry: br label %loop loop: -; CHECK: uabdlb -; CHECK: uabalb z0.h -; CHECK: uabalb z1.h -; CHECK: uabalb z2.h -; CHECK: add z0.h, z2.h, z0.h -; CHECK: add z0.h, z0.h, z1.h -; CHECK: uaddv d0, p0, z0.h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.uabalb.nxv8i16( %acc_phi, %a, %b) - - %next_i = add i32 %i, 16 - %cmp = icmp slt i32 %next_i, 256 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i16 @llvm.vector.reduce.add.nxv8i16( %acc_next) @@ -292,29 +401,41 @@ exit: declare @llvm.aarch64.sve.uabalb.nxv8i16(, , ) define i64 @uabalt_i32_to_i64_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: uabalt_i32_to_i64_accumulation +; CHECK-LABEL: uabalt_i32_to_i64_accumulation: +; CHECK: uabdlt z1.d, z1.s, z2.s +; CHECK: uabalt z0.d, z3.s, z4.s +; CHECK: uabalt z1.d, z4.s, z2.s +; CHECK: uabdlt z2.d, z3.s, z2.s +; CHECK: uabalt z2.d, z4.s, z3.s +; CHECK: uabalt z0.d, z3.s, z4.s +; CHECK: uabalt z1.d, z3.s, z4.s +; CHECK: uabalt z2.d, z3.s, z4.s +; CHECK: uabalt z0.d, z3.s, z4.s +; CHECK: uabalt z1.d, z3.s, z4.s +; CHECK: uabalt z2.d, z3.s, z4.s +; CHECK: uabalt z0.d, z3.s, z4.s +; CHECK: uabalt z1.d, z3.s, z4.s +; CHECK: uabalt z2.d, z3.s, z4.s +; CHECK: uabalt z0.d, z3.s, z4.s +; CHECK: add z0.d, z2.d, z0.d +; CHECK: uabalt z1.d, z3.s, z4.s +; CHECK: add z0.d, z0.d, z1.d +; CHECK: uaddv d0, p0, z0.d entry: br label %loop loop: -; CHECK: uabdlt -; CHECK: uabalt z0.d -; CHECK: uabalt z1.d -; CHECK: uabalt z2.d -; CHECK: add z0.d, z2.d, z0.d -; CHECK: add z0.d, z0.d, z1.d -; CHECK: uaddv d0, p0, z0.d %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.uabalt.nxv2i64( %acc_phi, %a, %b) - - %next_i = add i32 %i, 4 - %cmp = icmp slt i32 %next_i, 64 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i64 @llvm.vector.reduce.add.nxv2i64( %acc_next) @@ -324,29 +445,41 @@ exit: declare @llvm.aarch64.sve.uabalt.nxv2i64(, , ) define i32 @uabalt_i16_to_i32_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: uabalt_i16_to_i32_accumulation +; CHECK-LABEL: uabalt_i16_to_i32_accumulation: +; CHECK: uabdlt z1.s, z1.h, z2.h +; CHECK: uabalt z0.s, z3.h, z4.h +; CHECK: uabalt z1.s, z4.h, z2.h +; CHECK: uabdlt z2.s, z3.h, z2.h +; CHECK: uabalt z2.s, z4.h, z3.h +; CHECK: uabalt z0.s, z3.h, z4.h +; CHECK: uabalt z1.s, z3.h, z4.h +; CHECK: uabalt z2.s, z3.h, z4.h +; CHECK: uabalt z0.s, z3.h, z4.h +; CHECK: uabalt z1.s, z3.h, z4.h +; CHECK: uabalt z2.s, z3.h, z4.h +; CHECK: uabalt z0.s, z3.h, z4.h +; CHECK: uabalt z1.s, z3.h, z4.h +; CHECK: uabalt z2.s, z3.h, z4.h +; CHECK: uabalt z0.s, z3.h, z4.h +; CHECK: add z0.s, z2.s, z0.s +; CHECK: uabalt z1.s, z3.h, z4.h +; CHECK: add z0.s, z0.s, z1.s +; CHECK: uaddv d0, p0, z0.s entry: br label %loop loop: -; CHECK: uabdlt -; CHECK: uabalt z0.s -; CHECK: uabalt z1.s -; CHECK: uabalt z2.s -; CHECK: add z0.s, z2.s, z0.s -; CHECK: add z0.s, z0.s, z1.s -; CHECK: uaddv d0, p0, z0.s %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.uabalt.nxv4i32( %acc_phi, %a, %b) - - %next_i = add i32 %i, 8 - %cmp = icmp slt i32 %next_i, 128 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i32 @llvm.vector.reduce.add.nxv4i32( %acc_next) @@ -356,29 +489,41 @@ exit: declare @llvm.aarch64.sve.uabalt.nxv4i32(, , ) define i16 @uabalt_i8_to_i16_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: uabalt_i8_to_i16_accumulation +; CHECK-LABEL: uabalt_i8_to_i16_accumulation: +; CHECK: uabdlt z1.h, z1.b, z2.b +; CHECK: uabalt z0.h, z3.b, z4.b +; CHECK: uabalt z1.h, z4.b, z2.b +; CHECK: uabdlt z2.h, z3.b, z2.b +; CHECK: uabalt z2.h, z4.b, z3.b +; CHECK: uabalt z0.h, z3.b, z4.b +; CHECK: uabalt z1.h, z3.b, z4.b +; CHECK: uabalt z2.h, z3.b, z4.b +; CHECK: uabalt z0.h, z3.b, z4.b +; CHECK: uabalt z1.h, z3.b, z4.b +; CHECK: uabalt z2.h, z3.b, z4.b +; CHECK: uabalt z0.h, z3.b, z4.b +; CHECK: uabalt z1.h, z3.b, z4.b +; CHECK: uabalt z2.h, z3.b, z4.b +; CHECK: uabalt z0.h, z3.b, z4.b +; CHECK: add z0.h, z2.h, z0.h +; CHECK: uabalt z1.h, z3.b, z4.b +; CHECK: add z0.h, z0.h, z1.h +; CHECK: uaddv d0, p0, z0.h entry: br label %loop loop: -; CHECK: uabdlt -; CHECK: uabalt z0.h -; CHECK: uabalt z1.h -; CHECK: uabalt z2.h -; CHECK: add z0.h, z2.h, z0.h -; CHECK: add z0.h, z0.h, z1.h -; CHECK: uaddv d0, p0, z0.h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi [ zeroinitializer, %entry ], [ %acc_next, %loop ] - %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next = call @llvm.aarch64.sve.uabalt.nxv8i16( %acc_phi, %a, %b) - - %next_i = add i32 %i, 16 - %cmp = icmp slt i32 %next_i, 256 + + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %reduce = tail call i16 @llvm.vector.reduce.add.nxv8i16( %acc_next) @@ -388,23 +533,53 @@ exit: declare @llvm.aarch64.sve.uabalt.nxv8i16(, , ) define i16 @uabalt_and_uabalb_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: uabalt_and_uabalb_accumulation +; CHECK-LABEL: uabalt_and_uabalb_accumulation: +; CHECK: uabalb z1.h, z2.b, z3.b +; CHECK: uabalt z0.h, z2.b, z3.b +; CHECK: uabalb z1.h, z6.b, z7.b +; CHECK: uabalt z0.h, z6.b, z7.b +; CHECK: uabdlb z7.h, z4.b, z5.b +; CHECK: uabdlt z4.h, z4.b, z5.b +; CHECK: uabalb z7.h, z6.b, z5.b +; CHECK: uabalt z4.h, z6.b, z5.b +; CHECK: uabdlb z6.h, z2.b, z3.b +; CHECK: uabdlt z2.h, z2.b, z3.b +; CHECK: uabalb z6.h, z5.b, z3.b +; CHECK: uabalt z2.h, z5.b, z3.b +; CHECK: uabalb z1.h, z3.b, z5.b +; CHECK: uabalt z0.h, z3.b, z5.b +; CHECK: uabalb z7.h, z3.b, z5.b +; CHECK: uabalt z4.h, z3.b, z5.b +; CHECK: uabalb z6.h, z3.b, z5.b +; CHECK: uabalt z2.h, z3.b, z5.b +; CHECK: uabalb z1.h, z3.b, z5.b +; CHECK: uabalt z0.h, z3.b, z5.b +; CHECK: uabalb z7.h, z3.b, z5.b +; CHECK: uabalt z4.h, z3.b, z5.b +; CHECK: uabalb z6.h, z3.b, z5.b +; CHECK: uabalt z2.h, z3.b, z5.b +; CHECK: uabalb z1.h, z3.b, z5.b +; CHECK: uabalt z0.h, z3.b, z5.b +; CHECK: uabalb z7.h, z3.b, z5.b +; CHECK: uabalt z4.h, z3.b, z5.b +; CHECK: uabalb z6.h, z3.b, z5.b +; CHECK: uabalt z2.h, z3.b, z5.b +; CHECK: add z2.h, z4.h, z2.h +; CHECK: uabalb z1.h, z3.b, z5.b +; CHECK: uabalt z0.h, z3.b, z5.b +; CHECK: add z3.h, z7.h, z6.h +; CHECK: add z1.h, z3.h, z1.h +; CHECK: add z0.h, z2.h, z0.h +; CHECK: add z1.h, p0/m, z1.h, z0.h +; CHECK: uaddv d0, p0, z1.h entry: br label %loop loop: -; CHECK: uabdlt -; CHECK: uabdlb -; CHECK: uabalt z0.h -; CHECK: uabalt z2.h -; CHECK: uabalt z4.h -; CHECK: uabalb z1.h -; CHECK: uabalb z6.h -; CHECK: uabalb z5.h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_hi_phi = phi [ zeroinitializer, %entry ], [ %acc_next_hi, %loop ] %acc_lo_phi = phi [ zeroinitializer, %entry ], [ %acc_next_lo, %loop ] - %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i - %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i + %ptr1_i = getelementptr , ptr %ptr1, i32 %i + %ptr2_i = getelementptr , ptr %ptr2, i32 %i %a = load , ptr %ptr1_i, align 1 %b = load , ptr %ptr2_i, align 1 %acc_next_lo = call @llvm.aarch64.sve.uabalb.nxv8i16( %acc_lo_phi, @@ -413,8 +588,8 @@ loop: %acc_next_hi = call @llvm.aarch64.sve.uabalt.nxv8i16( %acc_hi_phi, %a, %b) - %next_i = add i32 %i, 16 - %cmp = icmp slt i32 %next_i, 256 + %next_i = add i32 %i, 1 + %cmp = icmp slt i32 %next_i, 16 br i1 %cmp, label %loop, label %exit exit: %mask = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) @@ -423,4 +598,4 @@ exit: ret i16 %reduce } -declare @llvm.aarch64.sve.add.nxv8i16(, , ) \ No newline at end of file +declare @llvm.aarch64.sve.add.nxv8i16(, , ) diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll index 86150a8d3d3ce..a84d666c1be6b 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll @@ -1,23 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "aba" --filter "abd" --filter "add" --version 5 ; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=arm64e-apple-darwin -o - | FileCheck %s define i16 @sabal_i8_to_i16_accumulation(ptr %ptr1, ptr %ptr2) { - ; CHECK-LABEL: sabal_i8_to_i16_accumulation +; CHECK-LABEL: sabal_i8_to_i16_accumulation: +; CHECK: sabdl.8h v1, v1, v3 +; CHECK: sabdl.8h v0, v0, v2 +; CHECK: sabdl.8h v2, v3, v5 +; CHECK: sabal.8h v1, v4, v6 +; CHECK: sabal.8h v0, v7, v17 +; CHECK: sabal.8h v2, v16, v18 +; CHECK: sabal.8h v1, v19, v21 +; CHECK: sabal.8h v0, v20, v22 +; CHECK: add.8h v1, v2, v1 +; CHECK: add.8h v0, v1, v0 +; CHECK: addv.8h h0, v0 entry: br label %loop loop: -; CHECK: sabdl.8h v1 -; CHECK: sabdl.8h v0 -; CHECK: sabdl.8h v2 -; CHECK: sabal.8h v1 -; CHECK: sabal.8h v0 -; CHECK: sabal.8h v2 -; CHECK: sabal.8h v1 -; CHECK: sabal.8h v0 -; CHECK: add.8h v1, v2, v1 -; CHECK: add.8h v0, v1, v0 -; CHECK: addv.8h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -43,22 +44,22 @@ declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) define i32 @sabal_i16_to_i32_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: sabal_i16_to_i32_accumulation +; CHECK-LABEL: sabal_i16_to_i32_accumulation: +; CHECK: sabdl.4s v1, v1, v3 +; CHECK: sabdl.4s v0, v0, v2 +; CHECK: sabdl.4s v2, v3, v5 +; CHECK: sabal.4s v1, v4, v6 +; CHECK: sabal.4s v0, v7, v17 +; CHECK: sabal.4s v2, v16, v18 +; CHECK: sabal.4s v1, v19, v21 +; CHECK: sabal.4s v0, v20, v22 +; CHECK: add.4s v1, v2, v1 +; CHECK: add.4s v0, v1, v0 +; CHECK: addv.4s s0, v0 entry: br label %loop loop: -; CHECK: sabdl.4s v1 -; CHECK: sabdl.4s v0 -; CHECK: sabdl.4s v2 -; CHECK: sabal.4s v1 -; CHECK: sabal.4s v0 -; CHECK: sabal.4s v2 -; CHECK: sabal.4s v1 -; CHECK: sabal.4s v0 -; CHECK: add.4s v1, v2, v1 -; CHECK: add.4s v0, v1, v0 -; CHECK: addv.4s %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -82,33 +83,33 @@ declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) define i16 @uabal2_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: uabal2_accumulation +; CHECK-LABEL: uabal2_accumulation: +; CHECK: uabdl2.8h v4, v1, v3 +; CHECK: uabdl.8h v1, v1, v3 +; CHECK: uabdl2.8h v24, v0, v2 +; CHECK: uabdl2.8h v25, v3, v6 +; CHECK: uabal2.8h v4, v5, v7 +; CHECK: uabal2.8h v24, v16, v18 +; CHECK: uabal2.8h v25, v17, v19 +; CHECK: uabal2.8h v4, v20, v22 +; CHECK: uabal2.8h v24, v21, v23 +; CHECK: add.8h v4, v25, v4 +; CHECK: add.8h v4, v4, v24 +; CHECK: uabdl.8h v0, v0, v2 +; CHECK: uabdl.8h v2, v3, v6 +; CHECK: uabal.8h v1, v5, v7 +; CHECK: uabal.8h v0, v16, v18 +; CHECK: uabal.8h v2, v17, v19 +; CHECK: uabal.8h v1, v20, v22 +; CHECK: uabal.8h v0, v21, v23 +; CHECK: add.8h v1, v2, v1 +; CHECK: add.8h v0, v1, v0 +; CHECK: add.8h v0, v4, v0 +; CHECK: addv.8h h0, v0 entry: br label %loop loop: -; CHECK: uabdl2.8h v4 -; CHECK: uabdl.8h v1 -; CHECK: uabdl2.8h v24 -; CHECK: uabdl2.8h v25 -; CHECK: uabal2.8h v4 -; CHECK: uabal2.8h v24 -; CHECK: uabal2.8h v25 -; CHECK: uabal2.8h v4 -; CHECK: uabal2.8h v24 -; CHECK: add.8h v4, v25, v4 -; CHECK: add.8h v4, v4, v24 -; CHECK: uabdl.8h v0 -; CHECK: uabdl.8h v2 -; CHECK: uabal.8h v1 -; CHECK: uabal.8h v0 -; CHECK: uabal.8h v2 -; CHECK: uabal.8h v1 -; CHECK: uabal.8h v0 -; CHECK: add.8h v1, v2, v1 -; CHECK: add.8h v0, v1, v0 -; CHECK: add.8h v0, v4, v0 -; CHECK: addv.8h h0, v0 %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi_hi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next_hi, %loop ] @@ -138,22 +139,22 @@ exit: } define i32 @uaba_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: uaba_accumulation +; CHECK-LABEL: uaba_accumulation: +; CHECK: uabd.4s v0, v0, v2 +; CHECK: uabd.4s v1, v1, v3 +; CHECK: uabd.4s v2, v2, v5 +; CHECK: uaba.4s v0, v4, v6 +; CHECK: uaba.4s v1, v7, v17 +; CHECK: uaba.4s v2, v16, v18 +; CHECK: uaba.4s v0, v19, v21 +; CHECK: uaba.4s v1, v20, v22 +; CHECK: add.4s v0, v2, v0 +; CHECK: add.4s v0, v0, v1 +; CHECK: addv.4s s0, v0 entry: br label %loop loop: -; CHECK: uabd.4s v0 -; CHECK: uabd.4s v1 -; CHECK: uabd.4s v2 -; CHECK: uaba.4s v0 -; CHECK: uaba.4s v1 -; CHECK: uaba.4s v2 -; CHECK: uaba.4s v0 -; CHECK: uaba.4s v1 -; CHECK: add.4s v0, v2, v0 -; CHECK: add.4s v0, v0, v1 -; CHECK: addv.4s %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -175,22 +176,22 @@ exit: declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define i32 @saba_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: saba_accumulation +; CHECK-LABEL: saba_accumulation: +; CHECK: sabd.4s v0, v0, v2 +; CHECK: sabd.4s v1, v1, v3 +; CHECK: sabd.4s v2, v2, v5 +; CHECK: saba.4s v0, v4, v6 +; CHECK: saba.4s v1, v7, v17 +; CHECK: saba.4s v2, v16, v18 +; CHECK: saba.4s v0, v19, v21 +; CHECK: saba.4s v1, v20, v22 +; CHECK: add.4s v0, v2, v0 +; CHECK: add.4s v0, v0, v1 +; CHECK: addv.4s s0, v0 entry: br label %loop loop: -; CHECK: sabd.4s v0 -; CHECK: sabd.4s v1 -; CHECK: sabd.4s v2 -; CHECK: saba.4s v0 -; CHECK: saba.4s v1 -; CHECK: saba.4s v2 -; CHECK: saba.4s v0 -; CHECK: saba.4s v1 -; CHECK: add.4s v0, v2, v0 -; CHECK: add.4s v0, v0, v1 -; CHECK: addv.4s %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -203,11 +204,11 @@ loop: %vabd = tail call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %a, <4 x i32> %b) %acc_next = add <4 x i32> %acc_phi, %vabd ; Increment loop counter and check the bound - %next_i = add i32 %i, 4 - %cmp = icmp slt i32 %next_i, 32 + %next_i = add i32 %i, 4 + %cmp = icmp slt i32 %next_i, 32 br i1 %cmp, label %loop, label %exit -exit: +exit: %reduce = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %acc_next) ret i32 %reduce } @@ -215,22 +216,22 @@ exit: declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define i32 @uaba_v2i32_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: uaba_v2i32_accumulation +; CHECK-LABEL: uaba_v2i32_accumulation: +; CHECK: uabd.2s v0, v0, v2 +; CHECK: uabd.2s v1, v1, v3 +; CHECK: uabd.2s v2, v2, v5 +; CHECK: uaba.2s v0, v4, v6 +; CHECK: uaba.2s v1, v7, v17 +; CHECK: uaba.2s v2, v16, v18 +; CHECK: uaba.2s v0, v19, v21 +; CHECK: uaba.2s v1, v20, v22 +; CHECK: add.2s v0, v2, v0 +; CHECK: add.2s v0, v0, v1 +; CHECK: addp.2s v0, v0, v0 entry: br label %loop loop: -; CHECK: uabd.2s v0 -; CHECK: uabd.2s v1 -; CHECK: uabd.2s v2 -; CHECK: uaba.2s v0 -; CHECK: uaba.2s v1 -; CHECK: uaba.2s v2 -; CHECK: uaba.2s v0 -; CHECK: uaba.2s v1 -; CHECK: add.2s v0, v2, v0 -; CHECK: add.2s v0, v0, v1 -; CHECK: addp.2s %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -250,22 +251,22 @@ exit: } define i8 @uaba_v8i8_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: uaba_v8i8_accumulation +; CHECK-LABEL: uaba_v8i8_accumulation: +; CHECK: uabd.8b v0, v0, v2 +; CHECK: uabd.8b v1, v1, v3 +; CHECK: uabd.8b v2, v2, v5 +; CHECK: uaba.8b v0, v4, v6 +; CHECK: uaba.8b v1, v7, v17 +; CHECK: uaba.8b v2, v16, v18 +; CHECK: uaba.8b v0, v19, v21 +; CHECK: uaba.8b v1, v20, v22 +; CHECK: add.8b v0, v2, v0 +; CHECK: add.8b v0, v0, v1 +; CHECK: addv.8b b0, v0 entry: br label %loop loop: -; CHECK: uabd.8b v0 -; CHECK: uabd.8b v1 -; CHECK: uabd.8b v2 -; CHECK: uaba.8b v0 -; CHECK: uaba.8b v1 -; CHECK: uaba.8b v2 -; CHECK: uaba.8b v0 -; CHECK: uaba.8b v1 -; CHECK: add.8b v0, v2, v0 -; CHECK: add.8b v0, v0, v1 -; CHECK: addv.8b %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -285,22 +286,22 @@ exit: } define i8 @uaba_v16i8_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: uaba_v16i8_accumulation +; CHECK-LABEL: uaba_v16i8_accumulation: +; CHECK: uabd.16b v0, v0, v2 +; CHECK: uabd.16b v1, v1, v3 +; CHECK: uabd.16b v2, v2, v5 +; CHECK: uaba.16b v0, v4, v6 +; CHECK: uaba.16b v1, v7, v17 +; CHECK: uaba.16b v2, v16, v18 +; CHECK: uaba.16b v0, v19, v21 +; CHECK: uaba.16b v1, v20, v22 +; CHECK: add.16b v0, v2, v0 +; CHECK: add.16b v0, v0, v1 +; CHECK: addv.16b b0, v0 entry: br label %loop loop: -; CHECK: uabd.16b v0 -; CHECK: uabd.16b v1 -; CHECK: uabd.16b v2 -; CHECK: uaba.16b v0 -; CHECK: uaba.16b v1 -; CHECK: uaba.16b v2 -; CHECK: uaba.16b v0 -; CHECK: uaba.16b v1 -; CHECK: add.16b v0, v2, v0 -; CHECK: add.16b v0, v0, v1 -; CHECK: addv.16b %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <16 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -320,22 +321,22 @@ exit: } define i16 @uaba_v8i16_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: uaba_v8i16_accumulation +; CHECK-LABEL: uaba_v8i16_accumulation: +; CHECK: uabd.8h v0, v0, v2 +; CHECK: uabd.8h v1, v1, v3 +; CHECK: uabd.8h v2, v2, v5 +; CHECK: uaba.8h v0, v4, v6 +; CHECK: uaba.8h v1, v7, v17 +; CHECK: uaba.8h v2, v16, v18 +; CHECK: uaba.8h v0, v19, v21 +; CHECK: uaba.8h v1, v20, v22 +; CHECK: add.8h v0, v2, v0 +; CHECK: add.8h v0, v0, v1 +; CHECK: addv.8h h0, v0 entry: br label %loop loop: -; CHECK: uabd.8h v0 -; CHECK: uabd.8h v1 -; CHECK: uabd.8h v2 -; CHECK: uaba.8h v0 -; CHECK: uaba.8h v1 -; CHECK: uaba.8h v2 -; CHECK: uaba.8h v0 -; CHECK: uaba.8h v1 -; CHECK: add.8h v0, v2, v0 -; CHECK: add.8h v0, v0, v1 -; CHECK: addv.8h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -355,22 +356,22 @@ exit: } define i8 @saba_v8i8_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: saba_v8i8_accumulation +; CHECK-LABEL: saba_v8i8_accumulation: +; CHECK: sabd.8b v0, v0, v2 +; CHECK: sabd.8b v1, v1, v3 +; CHECK: sabd.8b v2, v2, v5 +; CHECK: saba.8b v0, v4, v6 +; CHECK: saba.8b v1, v7, v17 +; CHECK: saba.8b v2, v16, v18 +; CHECK: saba.8b v0, v19, v21 +; CHECK: saba.8b v1, v20, v22 +; CHECK: add.8b v0, v2, v0 +; CHECK: add.8b v0, v0, v1 +; CHECK: addv.8b b0, v0 entry: br label %loop loop: -; CHECK: sabd.8b v0 -; CHECK: sabd.8b v1 -; CHECK: sabd.8b v2 -; CHECK: saba.8b v0 -; CHECK: saba.8b v1 -; CHECK: saba.8b v2 -; CHECK: saba.8b v0 -; CHECK: saba.8b v1 -; CHECK: add.8b v0, v2, v0 -; CHECK: add.8b v0, v0, v1 -; CHECK: addv.8b %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -390,21 +391,21 @@ exit: } define i16 @saba_v4i16_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: saba_v4i16_accumulation +; CHECK-LABEL: saba_v4i16_accumulation: +; CHECK: sabd.4h v0, v0, v2 +; CHECK: sabd.4h v1, v1, v3 +; CHECK: sabd.4h v2, v2, v5 +; CHECK: saba.4h v0, v4, v6 +; CHECK: saba.4h v1, v7, v17 +; CHECK: saba.4h v2, v16, v18 +; CHECK: saba.4h v0, v19, v21 +; CHECK: saba.4h v1, v20, v22 +; CHECK: add.4h v0, v2, v0 +; CHECK: add.4h v0, v0, v1 +; CHECK: addv.4h h0, v0 entry: br label %loop loop: -; CHECK: sabd.4h v0 -; CHECK: sabd.4h v1 -; CHECK: sabd.4h v2 -; CHECK: saba.4h v0 -; CHECK: saba.4h v1 -; CHECK: saba.4h v2 -; CHECK: saba.4h v0 -; CHECK: saba.4h v1 -; CHECK: add.4h v0, v2, v0 -; CHECK: add.4h v0, v0, v1 -; CHECK: addv.4h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <4 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -423,22 +424,22 @@ exit: } define i16 @saba_v8i16_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: saba_v8i16_accumulation +; CHECK-LABEL: saba_v8i16_accumulation: +; CHECK: sabd.8h v0, v0, v2 +; CHECK: sabd.8h v1, v1, v3 +; CHECK: sabd.8h v2, v2, v5 +; CHECK: saba.8h v0, v4, v6 +; CHECK: saba.8h v1, v7, v17 +; CHECK: saba.8h v2, v16, v18 +; CHECK: saba.8h v0, v19, v21 +; CHECK: saba.8h v1, v20, v22 +; CHECK: add.8h v0, v2, v0 +; CHECK: add.8h v0, v0, v1 +; CHECK: addv.8h h0, v0 entry: br label %loop loop: -; CHECK: sabd.8h v0 -; CHECK: sabd.8h v1 -; CHECK: sabd.8h v2 -; CHECK: saba.8h v0 -; CHECK: saba.8h v1 -; CHECK: saba.8h v2 -; CHECK: saba.8h v0 -; CHECK: saba.8h v1 -; CHECK: add.8h v0, v2, v0 -; CHECK: add.8h v0, v0, v1 -; CHECK: addv.8h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -458,22 +459,22 @@ exit: } define i16 @uabal_i8_to_i16_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: uabal_i8_to_i16_accumulation +; CHECK-LABEL: uabal_i8_to_i16_accumulation: +; CHECK: uabdl.8h v1, v1, v3 +; CHECK: uabdl.8h v0, v0, v2 +; CHECK: uabdl.8h v2, v3, v5 +; CHECK: uabal.8h v1, v4, v6 +; CHECK: uabal.8h v0, v7, v17 +; CHECK: uabal.8h v2, v16, v18 +; CHECK: uabal.8h v1, v19, v21 +; CHECK: uabal.8h v0, v20, v22 +; CHECK: add.8h v1, v2, v1 +; CHECK: add.8h v0, v1, v0 +; CHECK: addv.8h h0, v0 entry: br label %loop loop: -; CHECK: uabdl.8h v1 -; CHECK: uabdl.8h v0 -; CHECK: uabdl.8h v2 -; CHECK: uabal.8h v1 -; CHECK: uabal.8h v0 -; CHECK: uabal.8h v2 -; CHECK: uabal.8h v1 -; CHECK: uabal.8h v0 -; CHECK: add.8h v1, v2, v1 -; CHECK: add.8h v0, v1, v0 -; CHECK: addv.8h %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] @@ -494,22 +495,22 @@ exit: } define i32 @uabal_i16_to_i32_accumulation(ptr %ptr1, ptr %ptr2) { -; CHECK-LABEL: uabal_i16_to_i32_accumulation +; CHECK-LABEL: uabal_i16_to_i32_accumulation: +; CHECK: uabdl.4s v1, v1, v3 +; CHECK: uabdl.4s v0, v0, v2 +; CHECK: uabdl.4s v2, v3, v5 +; CHECK: uabal.4s v1, v4, v6 +; CHECK: uabal.4s v0, v7, v17 +; CHECK: uabal.4s v2, v16, v18 +; CHECK: uabal.4s v1, v19, v21 +; CHECK: uabal.4s v0, v20, v22 +; CHECK: add.4s v1, v2, v1 +; CHECK: add.4s v0, v1, v0 +; CHECK: addv.4s s0, v0 entry: br label %loop loop: -; CHECK: uabdl.4s v1 -; CHECK: uabdl.4s v0 -; CHECK: uabdl.4s v2 -; CHECK: uabal.4s v1 -; CHECK: uabal.4s v0 -; CHECK: uabal.4s v2 -; CHECK: uabal.4s v1 -; CHECK: uabal.4s v0 -; CHECK: add.4s v1, v2, v1 -; CHECK: add.4s v0, v1, v0 -; CHECK: addv.4s %i = phi i32 [ 0, %entry ], [ %next_i, %loop ] %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ]