diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 92053ed561901..4cd378f9aa595 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -1741,6 +1741,17 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
       LLVM_DEBUG(
           dbgs() << "Identified single reduction starting from instruction: "
                  << *Real << "/" << *ReductionInfo[Real].second << "\n");
+
+      // Reducing to a single vector is not supported; only reductions down
+      // to scalar values are permitted.
+      // Bailing out here leaves the prior node in the graph, but with no
+      // uses that node is unreachable by the replacement process. That,
+      // together with the use outside the graph, should prevent the
+      // replacement process from kicking off at all for this graph.
+      // TODO: Add support for reducing to a single vector value.
+      if (ReductionInfo[Real].second->getType()->isVectorTy())
+        continue;
+
       Processed[i] = true;
       auto RootNode = prepareCompositeNode(
           ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr);
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
new file mode 100644
index 0000000000000..faefaf9bad7b1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-unrolled-cdot.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @cdotp_i8_rot0(<vscale x 32 x i8> %a0, <vscale x 32 x i8> %b0, <vscale x 32 x i8> %a1, <vscale x 32 x i8> %b1) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A0:%.*]], <vscale x 32 x i8> [[B0:%.*]], <vscale x 32 x i8> [[A1:%.*]], <vscale x 32 x i8> [[B1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[VEC_PHI25:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A0]])
+; CHECK-SVE2-NEXT:    [[A0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A1]])
+; CHECK-SVE2-NEXT:    [[A1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B0]])
+; CHECK-SVE2-NEXT:    [[B0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B1]])
+; CHECK-SVE2-NEXT:    [[B1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = mul nsw <vscale x 16 x i32> [[B0_REAL_EXT]], [[A0_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = mul nsw <vscale x 16 x i32> [[B1_REAL_EXT]], [[A1_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[A0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = mul nsw <vscale x 16 x i32> [[B0_IMAG_EXT]], [[A0_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = mul nsw <vscale x 16 x i32> [[B1_IMAG_EXT]], [[A1_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP0]])
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE32:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI25]], <vscale x 16 x i32> [[TMP1]])
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP2]]
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP3]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE33]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP4]])
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE34]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE32]], <vscale x 16 x i32> [[TMP5]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]]
+; CHECK-SVE2-NEXT:    [[TMP23:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP23]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A0:%.*]], <vscale x 32 x i8> [[B0:%.*]], <vscale x 32 x i8> [[A1:%.*]], <vscale x 32 x i8> [[B1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[VEC_PHI25:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A0]])
+; CHECK-SVE-NEXT:    [[A0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A1]])
+; CHECK-SVE-NEXT:    [[A1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B0]])
+; CHECK-SVE-NEXT:    [[B0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B1]])
+; CHECK-SVE-NEXT:    [[B1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = mul nsw <vscale x 16 x i32> [[B0_REAL_EXT]], [[A0_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[TMP1:%.*]] = mul nsw <vscale x 16 x i32> [[B1_REAL_EXT]], [[A1_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[A0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[TMP2:%.*]] = mul nsw <vscale x 16 x i32> [[B0_IMAG_EXT]], [[A0_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[TMP3:%.*]] = mul nsw <vscale x 16 x i32> [[B1_IMAG_EXT]], [[A1_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP0]])
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE32:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI25]], <vscale x 16 x i32> [[TMP1]])
+; CHECK-SVE-NEXT:    [[TMP4:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP2]]
+; CHECK-SVE-NEXT:    [[TMP5:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP3]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE33]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP4]])
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE34]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE32]], <vscale x 16 x i32> [[TMP5]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]]
+; CHECK-SVE-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP6]]
+;
+; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A0:%.*]], <vscale x 32 x i8> [[B0:%.*]], <vscale x 32 x i8> [[A1:%.*]], <vscale x 32 x i8> [[B1:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE33:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[VEC_PHI25:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE34:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A0]])
+; CHECK-NOSVE-NEXT:    [[A0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A0_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A1]])
+; CHECK-NOSVE-NEXT:    [[A1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A1_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B0_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B0]])
+; CHECK-NOSVE-NEXT:    [[B0_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B0_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B0_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B1_DEINTERLEAVED:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B1]])
+; CHECK-NOSVE-NEXT:    [[B1_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B1_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B1_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B0_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B1_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = mul nsw <vscale x 16 x i32> [[B0_REAL_EXT]], [[A0_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[TMP1:%.*]] = mul nsw <vscale x 16 x i32> [[B1_REAL_EXT]], [[A1_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[A0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A0_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A1_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B0_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B0_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B1_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B1_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[TMP2:%.*]] = mul nsw <vscale x 16 x i32> [[B0_IMAG_EXT]], [[A0_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[TMP3:%.*]] = mul nsw <vscale x 16 x i32> [[B1_IMAG_EXT]], [[A1_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP0]])
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE32:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI25]], <vscale x 16 x i32> [[TMP1]])
+; CHECK-NOSVE-NEXT:    [[TMP4:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP2]]
+; CHECK-NOSVE-NEXT:    [[TMP5:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP3]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE33]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP4]])
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE34]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE32]], <vscale x 16 x i32> [[TMP5]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE34]], [[PARTIAL_REDUCE33]]
+; CHECK-NOSVE-NEXT:    [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP6]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce33, %vector.body ]
+  %vec.phi25 = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce34, %vector.body ]
+  %a0.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a0)
+  %a0.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a0.deinterleaved, 0
+  %a0.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a0.deinterleaved, 1
+  %a1.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a1)
+  %a1.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a1.deinterleaved, 0
+  %a1.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a1.deinterleaved, 1
+  %a0.real.ext = sext <vscale x 16 x i8> %a0.real to <vscale x 16 x i32>
+  %a1.real.ext = sext <vscale x 16 x i8> %a1.real to <vscale x 16 x i32>
+  %b0.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b0)
+  %b0.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b0.deinterleaved, 0
+  %b0.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b0.deinterleaved, 1
+  %b1.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b1)
+  %b1.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b1.deinterleaved, 0
+  %b1.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b1.deinterleaved, 1
+  %b0.real.ext = sext <vscale x 16 x i8> %b0.real to <vscale x 16 x i32>
+  %b1.real.ext = sext <vscale x 16 x i8> %b1.real to <vscale x 16 x i32>
+  %18 = mul nsw <vscale x 16 x i32> %b0.real.ext, %a0.real.ext
+  %19 = mul nsw <vscale x 16 x i32> %b1.real.ext, %a1.real.ext
+  %a0.imag.ext = sext <vscale x 16 x i8> %a0.imag to <vscale x 16 x i32>
+  %a1.imag.ext = sext <vscale x 16 x i8> %a1.imag to <vscale x 16 x i32>
+  %b0.imag.ext = sext <vscale x 16 x i8> %b0.imag to <vscale x 16 x i32>
+  %b1.imag.ext = sext <vscale x 16 x i8> %b1.imag to <vscale x 16 x i32>
+  %24 = mul nsw <vscale x 16 x i32> %b0.imag.ext, %a0.imag.ext
+  %25 = mul nsw <vscale x 16 x i32> %b1.imag.ext, %a1.imag.ext
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %18)
+  %partial.reduce32 = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi25, <vscale x 16 x i32> %19)
+  %26 = sub nsw <vscale x 16 x i32> zeroinitializer, %24
+  %27 = sub nsw <vscale x 16 x i32> zeroinitializer, %25
+  %partial.reduce33 = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %partial.reduce, <vscale x 16 x i32> %26)
+  %partial.reduce34 = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %partial.reduce32, <vscale x 16 x i32> %27)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <vscale x 4 x i32> %partial.reduce34, %partial.reduce33
+  %29 = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %bin.rdx)
+  ret i32 %29
+}
+
+declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 8 x i32>)
+
+declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
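+
+; Commentary (illustrative, not autogenerated): each reduction phi above is
+; consumed by %bin.rdx, a <vscale x 4 x i32> add in middle.block, before the
+; final scalar reduce. That vector-typed reduction user appears to be exactly
+; what the new isVectorTy() bail-out in ComplexDeinterleavingPass.cpp rejects,
+; which is why all three RUN lines expect the input IR to come out unchanged.
+; A minimal sketch of the shape that hits the guard, using the names from the
+; test above:
+;
+;   middle.block:
+;     ; the two partial reductions are combined as a vector value first ...
+;     %bin.rdx = add <vscale x 4 x i32> %partial.reduce34, %partial.reduce33
+;     ; ... so the reduction seen by the pass produces a vector, not a scalar
+;     %29 = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %bin.rdx)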