From 19638909d5f326aec2e51f84fae652ab6a0dc933 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Sun, 25 May 2025 13:44:46 -0700 Subject: [PATCH] [RISCV][TTI] Adjust costing in getPartialReductionCost for zvqdotq Two changes: 1) Handle fixed vector cases now that 77a3f8 has landed. 2) Fix a mistake in the original costing - the VF passed in is the input VF, not the output VF. Given that, we should be costing the accumulator type with VF/4. Note that (2) does not cause any visible test differences as the vectorizer (outside of maximize-bandwidth mode) does not consider wide enough VF for the costing difference to matter. --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 4 +- .../RISCV/partial-reduce-dot-product.ll | 224 ++++++++++++++++++ 2 files changed, 226 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index d54ad63404578..860d787111ce4 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -306,10 +306,10 @@ InstructionCost RISCVTTIImpl::getPartialReductionCost( Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul || InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) || OpAExtend != OpBExtend || !AccumType->isIntegerTy(32) || - !VF.isKnownMultipleOf(4) || !VF.isScalable()) + !VF.isKnownMultipleOf(4)) return InstructionCost::getInvalid(); - Type *Tp = VectorType::get(AccumType, VF); + Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4)); std::pair LT = getTypeLegalizationCost(Tp); // Note: Asuming all vqdot* variants are equal cost // TODO: Thread CostKind through this API diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll index 23534143ed3a9..847c4ba0bebfc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll +++
b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4 ; RUN: opt -passes=loop-vectorize -mattr=+v -S < %s | FileCheck %s --check-prefixes=CHECK,V ; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -S < %s | FileCheck %s --check-prefixes=CHECK,ZVQDOTQ +; RUN: opt -passes=loop-vectorize -mattr=+v -scalable-vectorization=off -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-V +; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -scalable-vectorization=off -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-ZVQDOTQ target triple = "riscv64-none-unknown-elf" @@ -79,6 +81,80 @@ define i32 @vqdot(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; ZVQDOTQ: scalar.ph: ; +; FIXED-V-LABEL: define i32 @vqdot( +; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; FIXED-V-NEXT: entry: +; FIXED-V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-V: vector.ph: +; FIXED-V-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-V: vector.body: +; FIXED-V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-V-NEXT: [[TMP3:%.*]] = sext <8 x i8> 
[[WIDE_LOAD]] to <8 x i32> +; FIXED-V-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 +; FIXED-V-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-V-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-V-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-V-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-V-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]] +; FIXED-V-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] +; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-V: middle.block: +; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] +; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-V: scalar.ph: +; +; FIXED-ZVQDOTQ-LABEL: define i32 @vqdot( +; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; FIXED-ZVQDOTQ-NEXT: entry: +; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-ZVQDOTQ: vector.ph: +; FIXED-ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-ZVQDOTQ: vector.body: +; FIXED-ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; 
FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-ZVQDOTQ: middle.block: 
+; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; FIXED-ZVQDOTQ-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-ZVQDOTQ: scalar.ph: +; entry: br label %for.body @@ -177,6 +253,80 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { ; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; ZVQDOTQ: scalar.ph: ; +; FIXED-V-LABEL: define i32 @vqdotu( +; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; FIXED-V-NEXT: entry: +; FIXED-V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-V: vector.ph: +; FIXED-V-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-V: vector.body: +; FIXED-V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-V-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-V-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr 
[[TMP7]], align 1 +; FIXED-V-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-V-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-V-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-V-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-V-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]] +; FIXED-V-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] +; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-V: middle.block: +; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] +; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-V: scalar.ph: +; +; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotu( +; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; FIXED-ZVQDOTQ-NEXT: entry: +; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-ZVQDOTQ: vector.ph: +; FIXED-ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-ZVQDOTQ: vector.body: +; FIXED-ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; 
FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-ZVQDOTQ: middle.block: +; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; FIXED-ZVQDOTQ-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-ZVQDOTQ: scalar.ph: +; entry: br label %for.body @@ -238,6 +388,43 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], 
label [[SCALAR_PH]] ; CHECK: scalar.ph: ; +; FIXED-LABEL: define i32 @vqdotsu( +; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; FIXED-NEXT: entry: +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED: vector.ph: +; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED: vector.body: +; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; FIXED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 +; FIXED-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]] +; FIXED-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-NEXT: 
[[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED: middle.block: +; FIXED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] +; FIXED-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; FIXED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED: scalar.ph: +; entry: br label %for.body @@ -298,6 +485,43 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; +; FIXED-LABEL: define i32 @vqdotsu2( +; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; FIXED-NEXT: entry: +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED: vector.ph: +; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED: vector.body: +; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; FIXED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x 
i8>, ptr [[TMP6]], align 1 +; FIXED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 +; FIXED-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]] +; FIXED-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] +; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED: middle.block: +; FIXED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] +; FIXED-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; FIXED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED: scalar.ph: +; entry: br label %for.body