diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 505fae4e840f7..e455fabfe2e8d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18177,16 +18177,38 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, unsigned ExtOpcode = Op0.getOpcode(); SDValue A = Op0; SDValue B; + unsigned DotOpcode; if (ExtOpcode == ISD::MUL) { A = Op0.getOperand(0); B = Op0.getOperand(1); - if (A.getOpcode() != B.getOpcode() || - A.getOperand(0).getValueType() != B.getOperand(0).getValueType()) + if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType()) return SDValue(); - ExtOpcode = A.getOpcode(); - } - if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND) + auto OpCodeA = A.getOpcode(); + if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND) + return SDValue(); + + auto OpCodeB = B.getOpcode(); + if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND) + return SDValue(); + + if (OpCodeA == OpCodeB) { + DotOpcode = + OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT; + } else { + // Check USDOT support + if (!ST->hasMatMulInt8()) + return SDValue(); + DotOpcode = AArch64ISD::USDOT; + if (OpCodeA == ISD::SIGN_EXTEND) + std::swap(A, B); + } + } else if (ExtOpcode == ISD::ZERO_EXTEND) { + DotOpcode = AArch64ISD::UDOT; + } else if (ExtOpcode == ISD::SIGN_EXTEND) { + DotOpcode = AArch64ISD::SDOT; + } else { return SDValue(); + } EVT Op0VT = A.getOperand(0).getValueType(); bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0; @@ -18212,8 +18234,6 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, NumOfVecReduce = Op0VT.getVectorNumElements() / 8; TargetType = MVT::v2i32; } - auto DotOpcode = - (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT; // Handle the case where we need to generate only one Dot operation. 
if (NumOfVecReduce == 1) { SDValue Zeros = DAG.getConstant(0, DL, TargetType); diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index c345c1e50bbbb..748555d7bdfa1 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; CHECK-GI: warning: Instruction selection used fallback path for test_udot_v5i8 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_udot_v5i8_nomla @@ -290,6 +290,128 @@ entry: ret i32 %x } +define i32 @test_usdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { +; CHECK-SD-LABEL: test_usdot_v4i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: add w0, w8, w2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_v4i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: uxtb w8, w8 +; CHECK-GI-NEXT: sxtb w9, w9 +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b5, v2.b[2] +; CHECK-GI-NEXT: mov 
b4, v0.b[3] +; CHECK-GI-NEXT: mov b0, v2.b[1] +; CHECK-GI-NEXT: mov b6, v2.b[3] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: fmov w11, s3 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov w13, s5 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w12, s0 +; CHECK-GI-NEXT: uxtb w10, w10 +; CHECK-GI-NEXT: uxtb w11, w11 +; CHECK-GI-NEXT: sxtb w13, w13 +; CHECK-GI-NEXT: uxtb w8, w8 +; CHECK-GI-NEXT: sxtb w12, w12 +; CHECK-GI-NEXT: mov v1.h[1], w10 +; CHECK-GI-NEXT: fmov w10, s6 +; CHECK-GI-NEXT: fmov s0, w11 +; CHECK-GI-NEXT: fmov s3, w13 +; CHECK-GI-NEXT: mov v2.h[1], w12 +; CHECK-GI-NEXT: sxtb w10, w10 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v3.h[1], w10 +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: mov v1.d[1], v0.d[0] +; CHECK-GI-NEXT: mov v2.d[1], v3.d[0] +; CHECK-GI-NEXT: mul v0.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: add w0, w8, w2 +; CHECK-GI-NEXT: ret +entry: + %0 = load <4 x i8>, ptr %a + %1 = zext <4 x i8> %0 to <4 x i32> + %2 = load <4 x i8>, ptr %b + %3 = sext <4 x i8> %2 to <4 x i32> + %4 = mul nsw <4 x i32> %3, %1 + %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4) + %op.extra = add nsw i32 %5, %sum + ret i32 %op.extra +} + +define i32 @test_usdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { +; CHECK-SD-LABEL: test_usdot_v4i8_double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-SD-NEXT: bic v2.4h, #255, lsl #8 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-SD-NEXT: shl v3.4s, v3.4s, #24 +; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-SD-NEXT: shl v1.4s, v1.4s, #24 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: sshr v3.4s, v3.4s, #24 +; CHECK-SD-NEXT: sshr v1.4s, v1.4s, #24 +; 
CHECK-SD-NEXT: mul v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v2.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_v4i8_double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: movi v4.2d, #0x0000ff000000ff +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: shl v1.4s, v1.4s, #24 +; CHECK-GI-NEXT: shl v3.4s, v3.4s, #24 +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b +; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #24 +; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #24 +; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret +entry: + %az = zext <4 x i8> %a to <4 x i32> + %bz = sext <4 x i8> %b to <4 x i32> + %m1 = mul nuw nsw <4 x i32> %az, %bz + %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1) + %cz = zext <4 x i8> %c to <4 x i32> + %dz = sext <4 x i8> %d to <4 x i32> + %m2 = mul nuw nsw <4 x i32> %cz, %dz + %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2) + %x = add i32 %r1, %r2 + ret i32 %x +} + define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v5i8: ; CHECK: // %bb.0: // %entry @@ -508,6 +630,77 @@ entry: ret i32 %2 } +define i32 @test_usdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) { +; CHECK-SD-LABEL: test_usdot_v8i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ldr d1, [x0] +; CHECK-SD-NEXT: ldr d2, [x1] +; CHECK-SD-NEXT: usdot v0.2s, v1.8b, v2.8b +; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: 
test_usdot_v8i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mul v2.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: addv s0, v2.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %0 = load <8 x i8>, ptr %a + %1 = zext <8 x i8> %0 to <8 x i32> + %2 = load <8 x i8>, ptr %b + %3 = sext <8 x i8> %2 to <8 x i32> + %4 = mul nsw <8 x i32> %3, %1 + %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) + ret i32 %5 +} + +define i32 @test_usdot_swapped_operands_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) { +; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ldr d1, [x0] +; CHECK-SD-NEXT: ldr d2, [x1] +; CHECK-SD-NEXT: usdot v0.2s, v2.8b, v1.8b +; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v2.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mul v2.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: addv s0, v2.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret +entry: + %0 = load <8 x i8>, ptr %a + %1 = sext <8 x i8> %0 to <8 x i32> + %2 = load <8 x i8>, ptr %b + %3 = zext <8 x i8> %2 to <8 x i32> + %4 = mul nsw <8 x i32> %3, %1 + %5 = call i32 
@llvm.vector.reduce.add.v8i32(<8 x i32> %4) + ret i32 %5 +} define i32 @test_udot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v16i8: @@ -587,6 +780,101 @@ entry: ret i32 %2 } +define i32 @test_usdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { +; CHECK-SD-LABEL: test_usdot_v16i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ldr q1, [x0] +; CHECK-SD-NEXT: ldr q2, [x1] +; CHECK-SD-NEXT: usdot v0.4s, v1.16b, v2.16b +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: add w0, w8, w2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_v16i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll2 v4.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v6.4s, v3.8h, #0 +; CHECK-GI-NEXT: sshll2 v7.4s, v1.8h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mul v4.4s, v6.4s, v4.4s +; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s +; CHECK-GI-NEXT: mla v4.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: add v0.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: add w0, w8, w2 +; CHECK-GI-NEXT: ret +entry: + %0 = load <16 x i8>, ptr %a + %1 = zext <16 x i8> %0 to <16 x i32> + %2 = load <16 x i8>, ptr %b + %3 = sext <16 x i8> %2 to <16 x i32> + %4 = mul nsw <16 x i32> %3, %1 + %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4) + %op.extra = add nsw i32 %5, %sum + ret i32 %op.extra +} + +define i32 @test_usdot_swapped_operands_v16i8(ptr nocapture 
readonly %a, ptr nocapture readonly %b, i32 %sum) { +; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ldr q1, [x0] +; CHECK-SD-NEXT: ldr q2, [x1] +; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v1.16b +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: add w0, w8, w2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v6.4s, v3.8h, #0 +; CHECK-GI-NEXT: ushll2 v7.4s, v1.8h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mul v4.4s, v6.4s, v4.4s +; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s +; CHECK-GI-NEXT: mla v4.4s, v3.4s, v2.4s +; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: add v0.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: add w0, w8, w2 +; CHECK-GI-NEXT: ret +entry: + %0 = load <16 x i8>, ptr %a + %1 = sext <16 x i8> %0 to <16 x i32> + %2 = load <16 x i8>, ptr %b + %3 = zext <16 x i8> %2 to <16 x i32> + %4 = mul nsw <16 x i32> %3, %1 + %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4) + %op.extra = add nsw i32 %5, %sum + ret i32 %op.extra +} define i32 @test_udot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; CHECK-SD-LABEL: test_udot_v8i8_double: @@ -860,6 +1148,240 @@ entry: ret i32 %x } + +define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-SD-LABEL: test_usdot_v8i8_double: 
+; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: usdot v5.2s, v0.8b, v1.8b +; CHECK-SD-NEXT: usdot v4.2s, v2.8b, v3.8b +; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s +; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_v8i8_double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: ushll2 v6.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll2 v7.4s, v3.8h, #0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s +; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: addv s0, v4.4s +; CHECK-GI-NEXT: addv s1, v5.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret +entry: + %az = zext <8 x i8> %a to <8 x i32> + %bz = sext <8 x i8> %b to <8 x i32> + %m1 = mul nuw nsw <8 x i32> %az, %bz + %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1) + %cz = zext <8 x i8> %c to <8 x i32> + %dz = sext <8 x i8> %d to <8 x i32> + %m2 = mul nuw nsw <8 x i32> %cz, %dz + %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2) + %x = add i32 %r1, %r2 + ret i32 %x +} + +define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: 
usdot v5.2s, v1.8b, v0.8b +; CHECK-SD-NEXT: usdot v4.2s, v3.8b, v2.8b +; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s +; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8_double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll2 v7.4s, v3.8h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s +; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: addv s0, v4.4s +; CHECK-GI-NEXT: addv s1, v5.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret +entry: + %az = sext <8 x i8> %a to <8 x i32> + %bz = zext <8 x i8> %b to <8 x i32> + %m1 = mul nuw nsw <8 x i32> %az, %bz + %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1) + %cz = sext <8 x i8> %c to <8 x i32> + %dz = zext <8 x i8> %d to <8 x i32> + %m2 = mul nuw nsw <8 x i32> %cz, %dz + %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2) + %x = add i32 %r1, %r2 + ret i32 %x +} + +define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-SD-LABEL: test_usdot_v16i8_double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: usdot v5.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: usdot v4.4s, v2.16b, v3.16b +; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; 
CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_v16i8_double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v4.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll v6.8h, v2.8b, #0 +; CHECK-GI-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-GI-NEXT: ushll2 v16.4s, v4.8h, #0 +; CHECK-GI-NEXT: ushll2 v17.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0 +; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0 +; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0 +; CHECK-GI-NEXT: sshll2 v21.4s, v7.8h, #0 +; CHECK-GI-NEXT: ushll2 v22.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll2 v23.4s, v3.8h, #0 +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0 +; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s +; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s +; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s +; CHECK-GI-NEXT: mla v19.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s +; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret +entry: + %az = zext <16 x i8> %a to <16 x i32> + %bz = sext <16 x i8> %b to <16 x i32> + %m1 = mul nuw nsw <16 x i32> %az, %bz + %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1) + %cz = zext <16 x i8> %c to <16 x i32> + %dz = 
sext <16 x i8> %d to <16 x i32> + %m2 = mul nuw nsw <16 x i32> %cz, %dz + %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2) + %x = add i32 %r1, %r2 + ret i32 %x +} + + +define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v0.16b +; CHECK-SD-NEXT: usdot v4.4s, v3.16b, v2.16b +; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8_double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll v5.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: sshll v6.8h, v2.8b, #0 +; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0 +; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0 +; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v18.4s, v5.8h, #0 +; CHECK-GI-NEXT: ushll2 v19.4s, v1.8h, #0 +; CHECK-GI-NEXT: sshll2 v20.4s, v6.8h, #0 +; CHECK-GI-NEXT: ushll2 v21.4s, v7.8h, #0 +; CHECK-GI-NEXT: sshll2 v22.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0 +; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s +; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s +; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; 
CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s +; CHECK-GI-NEXT: mla v19.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s +; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret +entry: + %az = sext <16 x i8> %a to <16 x i32> + %bz = zext <16 x i8> %b to <16 x i32> + %m1 = mul nuw nsw <16 x i32> %az, %bz + %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1) + %cz = sext <16 x i8> %c to <16 x i32> + %dz = zext <16 x i8> %d to <16 x i32> + %m2 = mul nuw nsw <16 x i32> %cz, %dz + %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2) + %x = add i32 %r1, %r2 + ret i32 %x +} + define i32 @test_udot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-SD-LABEL: test_udot_v24i8: ; CHECK-SD: // %bb.0: // %entry @@ -1658,7 +2180,6 @@ entry: ret i32 %x } - define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v25i8: ; CHECK: // %bb.0: // %entry @@ -2301,6 +2822,202 @@ entry: ret i32 %x } +define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { +; CHECK-SD-LABEL: test_usdot_v32i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v1.2d, #0000000000000000 +; CHECK-SD-NEXT: ldp q2, q3, [x0] +; CHECK-SD-NEXT: ldp q4, q5, [x1] +; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b +; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: add w0, w8, w2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_v32i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldp q0, q1, [x1] +; CHECK-GI-NEXT: ldp q2, q3, [x0] +; 
CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll v6.8h, v2.8b, #0 +; CHECK-GI-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0 +; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0 +; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0 +; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0 +; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0 +; CHECK-GI-NEXT: ushll2 v21.4s, v2.8h, #0 +; CHECK-GI-NEXT: ushll2 v22.4s, v7.8h, #0 +; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0 +; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0 +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mul v16.4s, v16.4s, v20.4s +; CHECK-GI-NEXT: mul v17.4s, v17.4s, v21.4s +; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0 +; CHECK-GI-NEXT: mul v18.4s, v18.4s, v22.4s +; CHECK-GI-NEXT: mul v19.4s, v19.4s, v23.4s +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0 +; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: mla v16.4s, v4.4s, v6.4s +; CHECK-GI-NEXT: mla v17.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mla v18.4s, v5.4s, v7.4s +; CHECK-GI-NEXT: mla v19.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s +; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: add w0, w8, w2 +; CHECK-GI-NEXT: ret +entry: + %0 = load <32 x i8>, ptr %a + %1 = zext <32 x i8> %0 to <32 x i32> + %2 = load <32 x i8>, ptr %b + %3 = sext <32 x i8> %2 to <32 x i32> + %4 = mul nsw <32 x i32> %3, %1 + %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4) + %op.extra = add nsw i32 %5, %sum + ret i32 %op.extra +} + +define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) 
{ +; CHECK-SD-LABEL: test_usdot_v32i8_double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v16.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v17.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v18.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 +; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v3.16b +; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v2.16b +; CHECK-SD-NEXT: usdot v17.4s, v4.16b, v6.16b +; CHECK-SD-NEXT: usdot v19.4s, v5.16b, v7.16b +; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s +; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_v32i8_double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset b8, -8 +; CHECK-GI-NEXT: .cfi_offset b9, -16 +; CHECK-GI-NEXT: .cfi_offset b10, -24 +; CHECK-GI-NEXT: .cfi_offset b11, -32 +; CHECK-GI-NEXT: .cfi_offset b12, -40 +; CHECK-GI-NEXT: .cfi_offset b13, -48 +; CHECK-GI-NEXT: .cfi_offset b14, -56 +; CHECK-GI-NEXT: .cfi_offset b15, -64 +; CHECK-GI-NEXT: ushll v16.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: ushll v17.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-GI-NEXT: sshll v18.8h, v2.8b, #0 +; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-GI-NEXT: sshll v19.8h, v3.8b, #0 +; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-GI-NEXT: ushll v27.8h, v4.8b, #0 +; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0 +; CHECK-GI-NEXT: ushll v28.8h, v5.8b, #0 +; CHECK-GI-NEXT: sshll v29.8h, v6.8b, #0 +; CHECK-GI-NEXT: sshll2 v6.8h, v6.16b, #0 +; CHECK-GI-NEXT: ushll2 v5.8h, v5.16b, #0 +; CHECK-GI-NEXT: sshll v30.8h, 
v7.8b, #0 +; CHECK-GI-NEXT: sshll2 v7.8h, v7.16b, #0 +; CHECK-GI-NEXT: ushll2 v20.4s, v16.8h, #0 +; CHECK-GI-NEXT: ushll2 v21.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v22.4s, v17.8h, #0 +; CHECK-GI-NEXT: ushll2 v23.4s, v1.8h, #0 +; CHECK-GI-NEXT: sshll2 v24.4s, v18.8h, #0 +; CHECK-GI-NEXT: sshll2 v25.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll2 v26.4s, v19.8h, #0 +; CHECK-GI-NEXT: sshll2 v31.4s, v3.8h, #0 +; CHECK-GI-NEXT: ushll2 v8.4s, v27.8h, #0 +; CHECK-GI-NEXT: ushll2 v9.4s, v4.8h, #0 +; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0 +; CHECK-GI-NEXT: sshll2 v11.4s, v29.8h, #0 +; CHECK-GI-NEXT: sshll2 v12.4s, v6.8h, #0 +; CHECK-GI-NEXT: ushll2 v13.4s, v5.8h, #0 +; CHECK-GI-NEXT: sshll2 v14.4s, v30.8h, #0 +; CHECK-GI-NEXT: sshll2 v15.4s, v7.8h, #0 +; CHECK-GI-NEXT: mul v20.4s, v20.4s, v24.4s +; CHECK-GI-NEXT: mul v21.4s, v21.4s, v25.4s +; CHECK-GI-NEXT: mul v22.4s, v22.4s, v26.4s +; CHECK-GI-NEXT: mul v23.4s, v23.4s, v31.4s +; CHECK-GI-NEXT: mul v24.4s, v8.4s, v11.4s +; CHECK-GI-NEXT: mul v25.4s, v9.4s, v12.4s +; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 +; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mul v26.4s, v10.4s, v14.4s +; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mul v31.4s, v13.4s, v15.4s +; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll v18.4s, v18.4h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0 +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0 +; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0 +; CHECK-GI-NEXT: sshll v29.4s, v29.4h, #0 +; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0 +; CHECK-GI-NEXT: sshll v30.4s, v30.4h, #0 +; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0 +; CHECK-GI-NEXT: mla 
v20.4s, v16.4s, v18.4s +; CHECK-GI-NEXT: mla v21.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: mla v22.4s, v17.4s, v19.4s +; CHECK-GI-NEXT: mla v23.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: mla v24.4s, v27.4s, v29.4s +; CHECK-GI-NEXT: mla v25.4s, v4.4s, v6.4s +; CHECK-GI-NEXT: mla v26.4s, v28.4s, v30.4s +; CHECK-GI-NEXT: mla v31.4s, v5.4s, v7.4s +; CHECK-GI-NEXT: add v0.4s, v20.4s, v21.4s +; CHECK-GI-NEXT: add v1.4s, v22.4s, v23.4s +; CHECK-GI-NEXT: add v2.4s, v24.4s, v25.4s +; CHECK-GI-NEXT: add v3.4s, v26.4s, v31.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %az = zext <32 x i8> %a to <32 x i32> + %bz = sext <32 x i8> %b to <32 x i32> + %m1 = mul nuw nsw <32 x i32> %az, %bz + %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1) + %cz = zext <32 x i8> %c to <32 x i32> + %dz = sext <32 x i8> %d to <32 x i32> + %m2 = mul nuw nsw <32 x i32> %cz, %dz + %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2) + %x = add i32 %r1, %r2 + ret i32 %x +} + + define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v33i8: ; CHECK: // %bb.0: // %entry @@ -2866,6 +3583,7 @@ entry: %x = add i32 %r1, %r2 ret i32 %x } + define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-SD-LABEL: test_udot_v48i8: ; CHECK-SD: // %bb.0: // %entry @@ -4527,3 +5245,385 @@ entry: %x = add i32 %r1, %r2 ret i32 %x } + +define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { +; CHECK-SD-LABEL: test_usdot_v64i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v4.2d, 
#0000000000000000 +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: ldp q1, q2, [x0, #32] +; CHECK-SD-NEXT: ldp q6, q7, [x1, #32] +; CHECK-SD-NEXT: ldp q16, q17, [x0] +; CHECK-SD-NEXT: ldp q18, q19, [x1] +; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v7.16b +; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v6.16b +; CHECK-SD-NEXT: usdot v4.4s, v17.16b, v19.16b +; CHECK-SD-NEXT: usdot v3.4s, v16.16b, v18.16b +; CHECK-SD-NEXT: add v0.4s, v4.4s, v0.4s +; CHECK-SD-NEXT: add v1.4s, v3.4s, v5.4s +; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: add w0, w8, w2 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_v64i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset b8, -8 +; CHECK-GI-NEXT: .cfi_offset b9, -16 +; CHECK-GI-NEXT: .cfi_offset b10, -24 +; CHECK-GI-NEXT: .cfi_offset b11, -32 +; CHECK-GI-NEXT: .cfi_offset b12, -40 +; CHECK-GI-NEXT: .cfi_offset b13, -48 +; CHECK-GI-NEXT: .cfi_offset b14, -56 +; CHECK-GI-NEXT: .cfi_offset b15, -64 +; CHECK-GI-NEXT: ldp q0, q1, [x1] +; CHECK-GI-NEXT: ldp q21, q17, [x0] +; CHECK-GI-NEXT: ldp q3, q19, [x1, #32] +; CHECK-GI-NEXT: ldp q18, q4, [x0, #32] +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll2 v5.8h, v0.16b, #0 +; CHECK-GI-NEXT: sshll v7.8h, v1.8b, #0 +; CHECK-GI-NEXT: sshll2 v22.8h, v1.16b, #0 +; CHECK-GI-NEXT: sshll v23.8h, v3.8b, #0 +; CHECK-GI-NEXT: sshll2 v24.8h, v3.16b, #0 +; CHECK-GI-NEXT: sshll v25.8h, v19.8b, #0 +; CHECK-GI-NEXT: sshll2 v26.8h, v19.16b, #0 +; CHECK-GI-NEXT: ushll v27.8h, v21.8b, #0 +; CHECK-GI-NEXT: ushll2 v28.8h, v21.16b, #0 +; CHECK-GI-NEXT: ushll v30.8h, v17.8b, #0 +; CHECK-GI-NEXT: ushll2 v17.8h, 
v17.16b, #0 +; CHECK-GI-NEXT: ushll v8.8h, v18.8b, #0 +; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0 +; CHECK-GI-NEXT: ushll v9.8h, v4.8b, #0 +; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0 +; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v1.4s, v5.4h, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v5.8h, #0 +; CHECK-GI-NEXT: sshll v2.4s, v7.4h, #0 +; CHECK-GI-NEXT: sshll2 v20.4s, v7.8h, #0 +; CHECK-GI-NEXT: sshll v3.4s, v22.4h, #0 +; CHECK-GI-NEXT: sshll2 v22.4s, v22.8h, #0 +; CHECK-GI-NEXT: sshll v5.4s, v23.4h, #0 +; CHECK-GI-NEXT: sshll2 v23.4s, v23.8h, #0 +; CHECK-GI-NEXT: sshll v7.4s, v24.4h, #0 +; CHECK-GI-NEXT: sshll2 v24.4s, v24.8h, #0 +; CHECK-GI-NEXT: sshll v19.4s, v25.4h, #0 +; CHECK-GI-NEXT: sshll2 v25.4s, v25.8h, #0 +; CHECK-GI-NEXT: sshll v21.4s, v26.4h, #0 +; CHECK-GI-NEXT: sshll2 v26.4s, v26.8h, #0 +; CHECK-GI-NEXT: ushll v29.4s, v27.4h, #0 +; CHECK-GI-NEXT: ushll2 v27.4s, v27.8h, #0 +; CHECK-GI-NEXT: ushll v31.4s, v28.4h, #0 +; CHECK-GI-NEXT: ushll2 v28.4s, v28.8h, #0 +; CHECK-GI-NEXT: ushll v10.4s, v30.4h, #0 +; CHECK-GI-NEXT: ushll2 v30.4s, v30.8h, #0 +; CHECK-GI-NEXT: ushll v11.4s, v17.4h, #0 +; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0 +; CHECK-GI-NEXT: ushll2 v12.4s, v8.8h, #0 +; CHECK-GI-NEXT: ushll2 v13.4s, v18.8h, #0 +; CHECK-GI-NEXT: ushll2 v14.4s, v9.8h, #0 +; CHECK-GI-NEXT: ushll2 v15.4s, v4.8h, #0 +; CHECK-GI-NEXT: mul v6.4s, v6.4s, v27.4s +; CHECK-GI-NEXT: mul v16.4s, v16.4s, v28.4s +; CHECK-GI-NEXT: mul v20.4s, v20.4s, v30.4s +; CHECK-GI-NEXT: mul v17.4s, v22.4s, v17.4s +; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0 +; CHECK-GI-NEXT: mul v22.4s, v23.4s, v12.4s +; CHECK-GI-NEXT: mul v23.4s, v24.4s, v13.4s +; CHECK-GI-NEXT: mul v24.4s, v25.4s, v14.4s +; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mul v25.4s, v26.4s, v15.4s +; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0 +; CHECK-GI-NEXT: ushll v26.4s, v9.4h, #0 +; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: 
mla v6.4s, v0.4s, v29.4s +; CHECK-GI-NEXT: mla v16.4s, v1.4s, v31.4s +; CHECK-GI-NEXT: mla v20.4s, v2.4s, v10.4s +; CHECK-GI-NEXT: mla v17.4s, v3.4s, v11.4s +; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mla v22.4s, v5.4s, v8.4s +; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mla v23.4s, v7.4s, v18.4s +; CHECK-GI-NEXT: mla v24.4s, v19.4s, v26.4s +; CHECK-GI-NEXT: mla v25.4s, v21.4s, v4.4s +; CHECK-GI-NEXT: add v0.4s, v6.4s, v16.4s +; CHECK-GI-NEXT: add v1.4s, v20.4s, v17.4s +; CHECK-GI-NEXT: add v2.4s, v22.4s, v23.4s +; CHECK-GI-NEXT: add v3.4s, v24.4s, v25.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: add w0, w8, w2 +; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %0 = load <64 x i8>, ptr %a + %1 = zext <64 x i8> %0 to <64 x i32> + %2 = load <64 x i8>, ptr %b + %3 = sext <64 x i8> %2 to <64 x i32> + %4 = mul nsw <64 x i32> %3, %1 + %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4) + %op.extra = add nsw i32 %5, %sum + ret i32 %op.extra +} + +define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { +; CHECK-SD-LABEL: test_usdot_v64i8_double: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v18.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v21.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v22.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v23.2d, #0000000000000000 +; CHECK-SD-NEXT: ldp q16, q17, [sp, #64] +; CHECK-SD-NEXT: movi v24.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v25.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v26.2d, #0000000000000000 +; CHECK-SD-NEXT: movi v27.2d, #0000000000000000 +; CHECK-SD-NEXT: ldp q19, q20, [sp, #96] +; CHECK-SD-NEXT: usdot v18.4s, v3.16b, v7.16b +; CHECK-SD-NEXT: ldp q3, q7, [sp, #32] +; 
CHECK-SD-NEXT: usdot v21.4s, v1.16b, v5.16b +; CHECK-SD-NEXT: ldp q1, q5, [sp] +; CHECK-SD-NEXT: usdot v22.4s, v2.16b, v6.16b +; CHECK-SD-NEXT: usdot v23.4s, v0.16b, v4.16b +; CHECK-SD-NEXT: usdot v24.4s, v7.16b, v20.16b +; CHECK-SD-NEXT: usdot v27.4s, v3.16b, v19.16b +; CHECK-SD-NEXT: usdot v26.4s, v5.16b, v17.16b +; CHECK-SD-NEXT: usdot v25.4s, v1.16b, v16.16b +; CHECK-SD-NEXT: add v0.4s, v21.4s, v18.4s +; CHECK-SD-NEXT: add v1.4s, v23.4s, v22.4s +; CHECK-SD-NEXT: add v2.4s, v26.4s, v24.4s +; CHECK-SD-NEXT: add v3.4s, v25.4s, v27.4s +; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: add v1.4s, v3.4s, v2.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_usdot_v64i8_double: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #304 +; CHECK-GI-NEXT: stp d15, d14, [sp, #224] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d13, d12, [sp, #240] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d11, d10, [sp, #256] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp d9, d8, [sp, #272] // 16-byte Folded Spill +; CHECK-GI-NEXT: str x29, [sp, #288] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 304 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: .cfi_offset b10, -40 +; CHECK-GI-NEXT: .cfi_offset b11, -48 +; CHECK-GI-NEXT: .cfi_offset b12, -56 +; CHECK-GI-NEXT: .cfi_offset b13, -64 +; CHECK-GI-NEXT: .cfi_offset b14, -72 +; CHECK-GI-NEXT: .cfi_offset b15, -80 +; CHECK-GI-NEXT: ushll v17.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-GI-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v20.16b, v3.16b +; CHECK-GI-NEXT: ushll v16.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll2 v18.8h, v1.16b, #0 +; CHECK-GI-NEXT: ushll v26.8h, v2.8b, #0 +; CHECK-GI-NEXT: ldp q27, q28, [sp, #304] +; CHECK-GI-NEXT: ushll2 v29.8h, v2.16b, #0 +; 
CHECK-GI-NEXT: ushll v2.4s, v17.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v8.8h, v4.8b, #0 +; CHECK-GI-NEXT: ldp q23, q21, [sp, #368] +; CHECK-GI-NEXT: sshll2 v9.8h, v4.16b, #0 +; CHECK-GI-NEXT: sshll2 v11.8h, v5.16b, #0 +; CHECK-GI-NEXT: mov v25.16b, v7.16b +; CHECK-GI-NEXT: ushll2 v19.4s, v17.8h, #0 +; CHECK-GI-NEXT: stp q1, q2, [sp, #192] // 32-byte Folded Spill +; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll2 v17.4s, v18.8h, #0 +; CHECK-GI-NEXT: ldp q24, q22, [sp, #336] +; CHECK-GI-NEXT: sshll v10.8h, v5.8b, #0 +; CHECK-GI-NEXT: sshll v12.8h, v6.8b, #0 +; CHECK-GI-NEXT: sshll2 v13.8h, v6.16b, #0 +; CHECK-GI-NEXT: mov v2.16b, v20.16b +; CHECK-GI-NEXT: sshll2 v0.4s, v8.8h, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v9.8h, #0 +; CHECK-GI-NEXT: sshll2 v6.4s, v11.8h, #0 +; CHECK-GI-NEXT: ushll2 v7.4s, v16.8h, #0 +; CHECK-GI-NEXT: ushll2 v31.4s, v29.8h, #0 +; CHECK-GI-NEXT: sshll2 v5.4s, v10.8h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v13.8h, #0 +; CHECK-GI-NEXT: ushll2 v30.4s, v26.8h, #0 +; CHECK-GI-NEXT: ushll v14.8h, v2.8b, #0 +; CHECK-GI-NEXT: mul v20.4s, v19.4s, v0.4s +; CHECK-GI-NEXT: mul v19.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: sshll v0.8h, v25.8b, #0 +; CHECK-GI-NEXT: mul v4.4s, v17.4s, v6.4s +; CHECK-GI-NEXT: sshll2 v15.4s, v12.8h, #0 +; CHECK-GI-NEXT: ldp q17, q3, [sp, #400] +; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s +; CHECK-GI-NEXT: mul v7.4s, v31.4s, v1.4s +; CHECK-GI-NEXT: ushll2 v31.8h, v2.16b, #0 +; CHECK-GI-NEXT: sshll2 v25.8h, v25.16b, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v14.4h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: str q3, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: ushll2 v3.4s, v14.8h, #0 +; CHECK-GI-NEXT: mul v6.4s, v30.4s, v15.4s +; CHECK-GI-NEXT: str q31, [sp, #160] // 16-byte Folded Spill +; CHECK-GI-NEXT: ushll v30.4s, v26.4h, #0 +; CHECK-GI-NEXT: sshll v26.4s, v8.4h, #0 +; CHECK-GI-NEXT: ushll v14.8h, v27.8b, #0 +; CHECK-GI-NEXT: ushll 
v15.4s, v29.4h, #0 +; CHECK-GI-NEXT: sshll v29.4s, v9.4h, #0 +; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s +; CHECK-GI-NEXT: ushll2 v3.4s, v31.8h, #0 +; CHECK-GI-NEXT: ushll v31.8h, v28.8b, #0 +; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0 +; CHECK-GI-NEXT: sshll v8.4s, v10.4h, #0 +; CHECK-GI-NEXT: sshll v9.4s, v11.4h, #0 +; CHECK-GI-NEXT: sshll v10.4s, v12.4h, #0 +; CHECK-GI-NEXT: sshll v11.4s, v13.4h, #0 +; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0 +; CHECK-GI-NEXT: stp q3, q25, [sp, #112] // 32-byte Folded Spill +; CHECK-GI-NEXT: ldr q3, [sp, #208] // 16-byte Folded Reload +; CHECK-GI-NEXT: ushll2 v28.8h, v28.16b, #0 +; CHECK-GI-NEXT: mla v1.4s, v2.4s, v0.4s +; CHECK-GI-NEXT: ushll2 v0.4s, v31.8h, #0 +; CHECK-GI-NEXT: mla v5.4s, v16.4s, v8.4s +; CHECK-GI-NEXT: mla v20.4s, v3.4s, v26.4s +; CHECK-GI-NEXT: sshll2 v3.4s, v25.8h, #0 +; CHECK-GI-NEXT: mla v6.4s, v30.4s, v10.4s +; CHECK-GI-NEXT: mla v7.4s, v15.4s, v11.4s +; CHECK-GI-NEXT: sshll v25.8h, v23.8b, #0 +; CHECK-GI-NEXT: mla v4.4s, v18.4s, v9.4s +; CHECK-GI-NEXT: ushll v30.8h, v22.8b, #0 +; CHECK-GI-NEXT: ushll2 v26.8h, v22.16b, #0 +; CHECK-GI-NEXT: sshll v22.8h, v21.8b, #0 +; CHECK-GI-NEXT: str q3, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload +; CHECK-GI-NEXT: ushll2 v8.8h, v27.16b, #0 +; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q9, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ushll2 v1.4s, v14.8h, #0 +; CHECK-GI-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill +; CHECK-GI-NEXT: mla v19.4s, v3.4s, v29.4s +; CHECK-GI-NEXT: sshll2 v7.4s, v25.8h, #0 +; CHECK-GI-NEXT: str q5, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: ushll v29.8h, v24.8b, #0 +; CHECK-GI-NEXT: ushll2 v27.8h, v24.16b, #0 +; CHECK-GI-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: ldp q0, q16, [sp, #96] // 32-byte Folded Reload +; CHECK-GI-NEXT: str q4, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: sshll2 v24.8h, v23.16b, #0 
+; CHECK-GI-NEXT: ushll2 v18.4s, v26.8h, #0 +; CHECK-GI-NEXT: stp q19, q20, [sp, #192] // 32-byte Folded Spill +; CHECK-GI-NEXT: sshll2 v20.8h, v21.16b, #0 +; CHECK-GI-NEXT: sshll v21.8h, v17.8b, #0 +; CHECK-GI-NEXT: sshll2 v19.8h, v17.16b, #0 +; CHECK-GI-NEXT: sshll2 v17.8h, v0.16b, #0 +; CHECK-GI-NEXT: mul v16.4s, v16.4s, v9.4s +; CHECK-GI-NEXT: ldr q9, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: sshll v23.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll2 v2.4s, v22.8h, #0 +; CHECK-GI-NEXT: ushll2 v12.4s, v27.8h, #0 +; CHECK-GI-NEXT: ushll v26.4s, v26.4h, #0 +; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v17.8h, #0 +; CHECK-GI-NEXT: mul v7.4s, v9.4s, v7.4s +; CHECK-GI-NEXT: ldr q9, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: sshll2 v5.4s, v19.8h, #0 +; CHECK-GI-NEXT: sshll v17.4s, v17.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v20.8h, #0 +; CHECK-GI-NEXT: mul v2.4s, v9.4s, v2.4s +; CHECK-GI-NEXT: ldr q9, [sp, #128] // 16-byte Folded Reload +; CHECK-GI-NEXT: ushll2 v15.4s, v8.8h, #0 +; CHECK-GI-NEXT: mul v0.4s, v18.4s, v0.4s +; CHECK-GI-NEXT: ldr q18, [sp, #160] // 16-byte Folded Reload +; CHECK-GI-NEXT: ushll2 v11.4s, v29.8h, #0 +; CHECK-GI-NEXT: sshll v9.4s, v9.4h, #0 +; CHECK-GI-NEXT: ushll2 v13.4s, v30.8h, #0 +; CHECK-GI-NEXT: sshll2 v1.4s, v24.8h, #0 +; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v21.8h, #0 +; CHECK-GI-NEXT: sshll2 v6.4s, v23.8h, #0 +; CHECK-GI-NEXT: mul v5.4s, v12.4s, v5.4s +; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0 +; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0 +; CHECK-GI-NEXT: mla v0.4s, v26.4s, v17.4s +; CHECK-GI-NEXT: mul v3.4s, v10.4s, v3.4s +; CHECK-GI-NEXT: mul v1.4s, v15.4s, v1.4s +; CHECK-GI-NEXT: mla v16.4s, v18.4s, v9.4s +; CHECK-GI-NEXT: ldp q18, q17, [sp, #192] // 32-byte Folded Reload +; CHECK-GI-NEXT: mul v4.4s, v11.4s, v4.4s +; CHECK-GI-NEXT: mul v6.4s, v13.4s, v6.4s +; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0 +; CHECK-GI-NEXT: ldp d13, d12, [sp, #240] // 16-byte Folded 
Reload +; CHECK-GI-NEXT: sshll v20.4s, v20.4h, #0 +; CHECK-GI-NEXT: ushll v10.4s, v14.4h, #0 +; CHECK-GI-NEXT: ldp d15, d14, [sp, #224] // 16-byte Folded Reload +; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0 +; CHECK-GI-NEXT: ushll v31.4s, v31.4h, #0 +; CHECK-GI-NEXT: ushll v29.4s, v29.4h, #0 +; CHECK-GI-NEXT: ushll v30.4s, v30.4h, #0 +; CHECK-GI-NEXT: sshll v25.4s, v25.4h, #0 +; CHECK-GI-NEXT: sshll v24.4s, v24.4h, #0 +; CHECK-GI-NEXT: sshll v22.4s, v22.4h, #0 +; CHECK-GI-NEXT: sshll v21.4s, v21.4h, #0 +; CHECK-GI-NEXT: sshll v23.4s, v23.4h, #0 +; CHECK-GI-NEXT: mla v5.4s, v27.4s, v19.4s +; CHECK-GI-NEXT: ldr q19, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: add v17.4s, v17.4s, v18.4s +; CHECK-GI-NEXT: ldr q18, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: mla v3.4s, v28.4s, v20.4s +; CHECK-GI-NEXT: mla v7.4s, v10.4s, v25.4s +; CHECK-GI-NEXT: ldp d11, d10, [sp, #256] // 16-byte Folded Reload +; CHECK-GI-NEXT: mla v1.4s, v8.4s, v24.4s +; CHECK-GI-NEXT: ldp d9, d8, [sp, #272] // 16-byte Folded Reload +; CHECK-GI-NEXT: add v18.4s, v18.4s, v19.4s +; CHECK-GI-NEXT: ldp q20, q19, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: mla v2.4s, v31.4s, v22.4s +; CHECK-GI-NEXT: mla v4.4s, v29.4s, v21.4s +; CHECK-GI-NEXT: mla v6.4s, v30.4s, v23.4s +; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-GI-NEXT: add v19.4s, v19.4s, v20.4s +; CHECK-GI-NEXT: ldr q20, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: add v16.4s, v20.4s, v16.4s +; CHECK-GI-NEXT: add v3.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-GI-NEXT: add v4.4s, v17.4s, v18.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: add v5.4s, v19.4s, v16.4s +; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: addv s1, v2.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: add w0, 
w8, w9 +; CHECK-GI-NEXT: add sp, sp, #304 +; CHECK-GI-NEXT: ret +entry: + %az = zext <64 x i8> %a to <64 x i32> + %bz = sext <64 x i8> %b to <64 x i32> + %m1 = mul nuw nsw <64 x i32> %az, %bz + %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1) + %cz = zext <64 x i8> %c to <64 x i32> + %dz = sext <64 x i8> %d to <64 x i32> + %m2 = mul nuw nsw <64 x i32> %cz, %dz + %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2) + %x = add i32 %r1, %r2 + ret i32 %x +}