diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 3fba7e853eafb..3042251cf754d 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -666,6 +666,16 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return LT.first; break; } + case Intrinsic::fma: + case Intrinsic::fmuladd: { + // Given a fma or fmuladd, cost it the same as a fmul instruction which are + // usually the same for costs. TODO: Add fp16 and bf16 expansion costs. + Type *EltTy = RetTy->getScalarType(); + if (EltTy->isFloatTy() || EltTy->isDoubleTy() || + (EltTy->isHalfTy() && ST->hasFullFP16())) + return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind); + break; + } case Intrinsic::stepvector: { InstructionCost Cost = 1; // Cost of the `index' instruction auto LT = getTypeLegalizationCost(RetTy); diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll index 0a154d09c36ba..c208d03ff94b7 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll @@ -663,13 +663,13 @@ define void @fcopysign_fp16() { define void @fma() { ; CHECK-LABEL: 'fma' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %F128 = call fp128 @llvm.fma.f128(fp128 undef, fp128 undef, fp128 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V2F128 = call <2 x fp128> @llvm.fma.v2f128(<2 x fp128> undef, <2 x fp128> undef, <2 x fp128> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void @@ -698,10 +698,10 @@ define void @fma_fp16() { ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-FP16-LABEL: 'fma_fp16' -; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef) +; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %F16 = call half @llvm.fma.f16(half undef, half undef, half undef) @@ -713,13 +713,13 @@ define void @fma_fp16() { define void @fmuladd() { ; CHECK-LABEL: 'fmuladd' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %F32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %F64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F32 = call float @llvm.fmuladd.f32(float undef, float undef, float undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F64 = call double @llvm.fmuladd.f64(double undef, double undef, double undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:2 Lat:6 SizeLat:2 for: %F128 = call fp128 @llvm.fmuladd.f128(fp128 undef, fp128 undef, fp128 undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:2 Lat:6 SizeLat:2 for: %V2F128 = call <2 x fp128> @llvm.fmuladd.v2f128(<2 x fp128> undef, <2 x fp128> undef, <2 x fp128> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void @@ -748,10 +748,10 @@ define void @fmuladd_fp16() { ; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-FP16-LABEL: 'fmuladd_fp16' -; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %F16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef) -; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V4F16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found costs of 1 for: %V8F16 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %F16 = call half @llvm.fmuladd.f16(half undef, half undef, half undef) +; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %F16 = call half @llvm.fmuladd.f32(half undef, half undef, half undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll index f7ebd406d230a..1c40354892191 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll @@ -167,14 +167,14 @@ define void @frem() { define void @fma() { ; CHECK-LABEL: 'fma' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F16 = call @llvm.fma.nxv4f16( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8F16 = call @llvm.fma.nxv8f16( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call @llvm.fma.nxv16f16( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call @llvm.fma.nxv2f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call @llvm.fma.nxv4f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call @llvm.fma.nxv8f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call @llvm.fma.nxv2f64( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call @llvm.fma.nxv4f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call @llvm.fma.nxv4f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call @llvm.fma.nxv8f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call @llvm.fma.nxv16f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call @llvm.fma.nxv2f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call @llvm.fma.nxv4f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call @llvm.fma.nxv8f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call @llvm.fma.nxv2f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call @llvm.fma.nxv4f64( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V4F16 = call @llvm.fma.v4f16( undef, undef, undef) @@ -193,14 +193,14 @@ define void @fma() { define void @fmuladd() { ; CHECK-LABEL: 'fmuladd' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F16 = call @llvm.fmuladd.nxv4f16( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8F16 = call @llvm.fmuladd.nxv8f16( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = call @llvm.fmuladd.nxv16f16( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call @llvm.fmuladd.nxv2f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call @llvm.fmuladd.nxv4f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = call @llvm.fmuladd.nxv8f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call @llvm.fmuladd.nxv2f64( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = call @llvm.fmuladd.nxv4f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call @llvm.fmuladd.nxv4f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call @llvm.fmuladd.nxv8f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call @llvm.fmuladd.nxv16f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call @llvm.fmuladd.nxv2f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call @llvm.fmuladd.nxv4f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call @llvm.fmuladd.nxv8f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call @llvm.fmuladd.nxv2f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call @llvm.fmuladd.nxv4f64( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V4F16 = call @llvm.fmuladd.v4f16( undef, undef, undef) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll index d9710328d6048..f3542f63a4273 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll @@ -4,73 +4,102 @@ define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr %end1, double %x, i64 %n) { ; CHECK-LABEL: define double @fp128_fmuladd_reduction( ; CHECK-SAME: ptr [[START0:%.*]], ptr [[START1:%.*]], ptr [[END0:%.*]], ptr [[END1:%.*]], double [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[N_VEC]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP2]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ [[X]], %[[VECTOR_PH]] ], [ [[TMP29:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 48 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START0]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP4]] -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP5]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi double [ [[X]], %[[VECTOR_PH]] ], [ [[TMP21:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr fp128, ptr [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr fp128, ptr [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr fp128, ptr [[TMP1]], i32 6 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x fp128>, ptr [[TMP1]], align 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x fp128>, ptr [[TMP24]], align 16 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x fp128>, ptr [[TMP4]], align 16 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x fp128>, ptr [[TMP5]], align 16 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP3]], i32 4 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP3]], i32 6 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP28]], align 16 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP35]], align 16 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x double>, ptr [[TMP36]], align 16 +; CHECK-NEXT: [[TMP10:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD]] to <2 x double> +; CHECK-NEXT: [[TMP11:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD3]] to <2 x double> +; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD4]] to <2 x double> +; CHECK-NEXT: [[TMP13:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD5]] to <2 x double> +; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP10]], [[WIDE_LOAD6]] +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP11]], [[WIDE_LOAD7]] +; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP12]], [[WIDE_LOAD8]] +; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP13]], [[WIDE_LOAD9]] +; CHECK-NEXT: [[TMP18:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI]], <2 x double> [[TMP14]]) +; CHECK-NEXT: [[TMP19:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[TMP18]], <2 x double> [[TMP15]]) +; CHECK-NEXT: [[TMP20:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double [[TMP19]], <2 x double> [[TMP16]]) +; CHECK-NEXT: [[TMP21]] = call double @llvm.vector.reduce.fadd.v2f64(double [[TMP20]], <2 x double> [[TMP17]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 16 ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP6]] -; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = mul i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX4]], 8 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX4]], 16 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX4]], 24 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START1]], i64 [[OFFSET_IDX4]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP7]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP21]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[X]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 2 +; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[N]], [[N_MOD_VF10]] +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[N_VEC11]], 16 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC11]], 8 ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP8]] +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[VEC_EPILOG_PH]] ], [ [[TMP33:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX12]], 16 +; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[START0]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[INDEX12]], 8 ; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = load fp128, ptr [[NEXT_GEP]], align 16 -; CHECK-NEXT: [[TMP11:%.*]] = load fp128, ptr [[NEXT_GEP1]], align 16 -; CHECK-NEXT: [[TMP12:%.*]] = load fp128, ptr [[NEXT_GEP2]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = load fp128, ptr [[NEXT_GEP3]], align 16 -; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[NEXT_GEP5]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[NEXT_GEP6]], align 16 -; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[NEXT_GEP7]], align 16 -; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[NEXT_GEP8]], align 16 -; CHECK-NEXT: [[TMP18:%.*]] = fptrunc fp128 [[TMP10]] to double -; CHECK-NEXT: [[TMP19:%.*]] = fptrunc fp128 [[TMP11]] to double -; CHECK-NEXT: [[TMP20:%.*]] = fptrunc fp128 [[TMP12]] to double -; CHECK-NEXT: [[TMP21:%.*]] = fptrunc fp128 [[TMP13]] to double -; CHECK-NEXT: [[TMP22:%.*]] = fmul double [[TMP18]], [[TMP14]] -; CHECK-NEXT: [[TMP23:%.*]] = fmul double [[TMP19]], [[TMP15]] -; CHECK-NEXT: [[TMP24:%.*]] = fmul double [[TMP20]], [[TMP16]] -; CHECK-NEXT: [[TMP25:%.*]] = fmul double [[TMP21]], [[TMP17]] -; CHECK-NEXT: [[TMP26:%.*]] = fadd double [[VEC_PHI]], [[TMP22]] -; CHECK-NEXT: [[TMP27:%.*]] = fadd double [[TMP26]], [[TMP23]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd double [[TMP27]], [[TMP24]] -; CHECK-NEXT: [[TMP29]] = fadd double [[TMP28]], [[TMP25]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[START0]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[START1]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP29]], %[[MIDDLE_BLOCK]] ], [ [[X]], %[[ENTRY]] ] +; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x fp128>, ptr [[NEXT_GEP14]], align 16 +; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x double>, ptr [[NEXT_GEP8]], align 16 +; CHECK-NEXT: [[TMP31:%.*]] = fptrunc <2 x fp128> [[WIDE_LOAD17]] to <2 x double> +; CHECK-NEXT: [[TMP32:%.*]] = fmul <2 x double> [[TMP31]], [[WIDE_LOAD18]] +; CHECK-NEXT: [[TMP33]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI13]], <2 x double> [[TMP32]]) +; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX12]], 2 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC11]] +; CHECK-NEXT: br i1 [[TMP34]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[N]], [[N_VEC11]] +; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP26]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[NEXT_GEP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START0]], %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL21:%.*]] = phi ptr [ [[NEXT_GEP7]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[NEXT_GEP6]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START1]], %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi i64 [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi double [ [[TMP33]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP21]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[X]], %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PTR0:%.*]] = phi ptr [ [[PTR0_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR1:%.*]] = phi ptr [ [[PTR1_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL9]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL10]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[RED:%.*]] = phi double [ [[RED_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR0:%.*]] = phi ptr [ [[PTR0_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[PTR1:%.*]] = phi ptr [ [[PTR1_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL21]], %[[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL22]], %[[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[RED:%.*]] = phi double [ [[RED_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX23]], %[[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[PTR0_NEXT]] = getelementptr i8, ptr [[PTR0]], i64 16 ; CHECK-NEXT: [[PTR1_NEXT]] = getelementptr i8, ptr [[PTR1]], i64 8 ; CHECK-NEXT: [[LOAD0:%.*]] = load fp128, ptr [[PTR0]], align 16 @@ -79,9 +108,9 @@ define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr ; CHECK-NEXT: [[RED_NEXT]] = tail call double @llvm.fmuladd.f64(double [[TRUNC]], double [[LOAD1]], double [[RED]]) ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[LCSSA:%.*]] = phi double [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP29]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[LCSSA:%.*]] = phi double [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[TMP33]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret double [[LCSSA]] ; entry: @@ -110,5 +139,6 @@ exit: ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll index f6f2e39594dd8..64fc573e660bd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll @@ -1310,7 +1310,8 @@ define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) { ; ; LIBMVEC-SVE-LABEL: define void @fma_f64 ; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) +; LIBMVEC-SVE: [[TMP8:%.*]] = call @llvm.fma.nxv2f64( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]]) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.fma.f64(double [[IN:%.*]], double [[IN]], double [[IN]]) ; ; SLEEF-NEON-LABEL: define void @fma_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { @@ -1357,7 +1358,8 @@ define void @fma_f32(ptr noalias %in.ptr, ptr %out.ptr) { ; ; LIBMVEC-SVE-LABEL: define void @fma_f32 ; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) +; LIBMVEC-SVE: [[TMP8:%.*]] = call @llvm.fma.nxv4f32( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], [[WIDE_LOAD]]) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.fma.f32(float [[IN:%.*]], float [[IN]], float [[IN]]) ; ; SLEEF-NEON-LABEL: define void @fma_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index cca58d8d66f04..26ce0fc6e6a3b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -12,8 +12,7 @@ define void @test() { ; CHECK: [[BB63]]: ; CHECK-NEXT: br label %[[BB64]] ; CHECK: [[BB64]]: -; CHECK-NEXT: [[I65:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] -; CHECK-NEXT: [[I77:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <16 x float> [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ] ; CHECK-NEXT: [[I66:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I67:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I68:%.*]] = load float, ptr poison, align 8 @@ -25,122 +24,57 @@ define void @test() { ; CHECK-NEXT: [[I74:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[I75:%.*]] = load float, ptr poison, align 16 ; CHECK-NEXT: [[I76:%.*]] = load float, ptr poison, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x float> poison, float [[I76]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x float> [[TMP1]], float [[I75]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x float> [[TMP2]], float [[I74]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x float> [[TMP3]], float [[I73]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x float> [[TMP4]], float [[I71]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x float> [[TMP5]], float [[I70]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x float> [[TMP6]], float [[I68]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> [[TMP7]], float [[I66]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[I72]], i32 13 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[I67]], i32 14 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]] ; CHECK: [[BB77]]: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[I70]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[I68]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[I67]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[I69]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[I66]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[I68]], i32 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[I66]], i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[I67]], i32 6 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP18]], float [[I69]], i32 7 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP20]], <16 x float> [[TMP0]], <16 x i32> ; CHECK-NEXT: br label %[[BB78:.*]] ; CHECK: [[BB78]]: -; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[TMP46:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP39:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP30:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[TMP53:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[TMP40:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP0]], %[[BB77]] ], [ [[TMP38:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP1]], %[[BB77]] ], [ [[TMP35:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP3]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x float> [ [[TMP4]], %[[BB77]] ], [ [[TMP29:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x float> [[TMP8]], poison -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP7]], poison -; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[TMP6]], poison -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[TMP5]], poison -; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP8]], poison -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP7]], poison -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <2 x float> [[TMP6]], poison -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP17]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[TMP5]], poison -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP19]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <2 x float> [[TMP14]], [[TMP9]] -; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <2 x float> [[TMP21]], poison -; CHECK-NEXT: [[TMP23:%.*]] = fadd fast <2 x float> [[TMP16]], [[TMP10]] -; CHECK-NEXT: [[TMP24:%.*]] = fadd fast <2 x float> [[TMP23]], poison -; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x float> [[TMP18]], [[TMP11]] -; CHECK-NEXT: [[TMP26:%.*]] = fadd fast <2 x float> [[TMP25]], poison -; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <2 x float> [[TMP20]], [[TMP12]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x float> [[TMP27]], poison -; CHECK-NEXT: [[TMP29]] = fadd fast <2 x float> [[TMP22]], poison -; CHECK-NEXT: [[TMP30]] = extractelement <2 x float> [[TMP29]], i32 1 -; CHECK-NEXT: [[TMP31]] = extractelement <2 x float> [[TMP29]], i32 0 -; CHECK-NEXT: [[TMP32]] = fadd fast <2 x float> [[TMP24]], poison -; CHECK-NEXT: [[TMP53]] = extractelement <2 x float> [[TMP32]], i32 1 -; CHECK-NEXT: [[TMP46]] = extractelement <2 x float> [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP35]] = fadd fast <2 x float> [[TMP26]], poison -; CHECK-NEXT: [[TMP36]] = extractelement <2 x float> [[TMP35]], i32 1 -; CHECK-NEXT: [[TMP37]] = extractelement <2 x float> [[TMP35]], i32 0 -; CHECK-NEXT: [[TMP38]] = fadd fast <2 x float> [[TMP28]], poison -; CHECK-NEXT: [[TMP39]] = extractelement <2 x float> [[TMP38]], i32 1 -; CHECK-NEXT: [[TMP40]] = extractelement <2 x float> [[TMP38]], i32 0 -; CHECK-NEXT: [[I135:%.*]] = fmul fast float [[I85]], [[I65]] -; CHECK-NEXT: [[I128:%.*]] = fmul fast float [[I80]], [[I65]] -; CHECK-NEXT: [[I129:%.*]] = fmul fast float [[I81]], [[I65]] -; CHECK-NEXT: [[I130:%.*]] = fmul fast float [[I82]], [[I65]] -; CHECK-NEXT: [[I133:%.*]] = fmul fast float [[I84]], [[I77]] -; CHECK-NEXT: [[I136:%.*]] = fmul fast float [[I127]], [[I77]] -; CHECK-NEXT: [[I138:%.*]] = fmul fast float [[I131]], [[I77]] -; CHECK-NEXT: [[I137:%.*]] = fmul fast float [[I86]], [[I77]] -; CHECK-NEXT: [[OP_RDX14:%.*]] = fadd fast float poison, [[I133]] -; CHECK-NEXT: [[OP_RDX15:%.*]] = fadd fast float [[OP_RDX14]], [[I135]] -; CHECK-NEXT: [[OP_RDX12:%.*]] = fadd fast float poison, [[I136]] -; CHECK-NEXT: [[OP_RDX13:%.*]] = fadd fast float [[OP_RDX12]], [[I128]] -; CHECK-NEXT: [[OP_RDX10:%.*]] = fadd fast float poison, [[I138]] -; CHECK-NEXT: [[OP_RDX11:%.*]] = fadd fast float [[OP_RDX10]], [[I129]] -; CHECK-NEXT: [[OP_RDX8:%.*]] = fadd fast float poison, [[I137]] -; CHECK-NEXT: [[OP_RDX9:%.*]] = fadd fast float [[OP_RDX8]], [[I130]] -; CHECK-NEXT: [[TMP41:%.*]] = fmul fast <2 x float> [[TMP8]], poison -; CHECK-NEXT: [[TMP42:%.*]] = fmul fast <2 x float> [[TMP7]], poison -; CHECK-NEXT: [[TMP43:%.*]] = fmul fast <2 x float> [[TMP6]], poison -; CHECK-NEXT: [[TMP44:%.*]] = fmul fast <2 x float> [[TMP5]], poison -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x float> [[TMP41]], i32 1 -; CHECK-NEXT: [[I157:%.*]] = fadd fast float poison, [[TMP45]] -; CHECK-NEXT: [[I150:%.*]] = extractelement <2 x float> [[TMP41]], i32 0 -; CHECK-NEXT: [[TMP60:%.*]] = fadd fast float [[I157]], [[I150]] -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x float> [[TMP42]], i32 1 -; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast float poison, [[TMP47]] -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x float> [[TMP42]], i32 0 -; CHECK-NEXT: [[OP_RDX5:%.*]] = fadd fast float [[OP_RDX4]], [[TMP48]] -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x float> [[TMP43]], i32 1 -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float poison, [[TMP49]] -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x float> [[TMP43]], i32 0 -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP50]] -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x float> [[TMP44]], i32 0 -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float poison, [[TMP51]] -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x float> [[TMP44]], i32 1 -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP52]] +; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP14]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP23:%.*]] = phi <8 x float> [ [[TMP19]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP23]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP21]] +; CHECK-NEXT: [[TMP27:%.*]] = fmul fast <16 x float> [[TMP25]], [[TMP0]] +; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison +; CHECK-NEXT: [[TMP30:%.*]] = fadd fast <16 x float> [[TMP29]], poison +; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP32]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> ; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]] ; CHECK: [[BB167]]: -; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[OP_RDX1]], %[[BB78]] ] -; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX3]], %[[BB78]] ] -; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX5]], %[[BB78]] ] -; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[TMP60]], %[[BB78]] ] -; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[OP_RDX9]], %[[BB78]] ] -; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[OP_RDX11]], %[[BB78]] ] -; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[OP_RDX13]], %[[BB78]] ] -; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[OP_RDX15]], %[[BB78]] ] -; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[TMP40]], %[[BB78]] ] -; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP36]], %[[BB78]] ] -; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[TMP53]], %[[BB78]] ] -; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP30]], %[[BB78]] ] -; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP39]], %[[BB78]] ] -; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[TMP37]], %[[BB78]] ] -; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[TMP46]], %[[BB78]] ] -; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[TMP31]], %[[BB78]] ] +; CHECK-NEXT: [[TMP35:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP30]], %[[BB78]] ] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP35]], i32 14 ; CHECK-NEXT: store float [[TMP33]], ptr poison, align 1 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x float> [[TMP35]], i32 13 ; CHECK-NEXT: store float [[TMP34]], ptr poison, align 1 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x float> [[TMP35]], i32 15 ; CHECK-NEXT: br i1 poison, label %[[BB186:.*]], label %[[BB184:.*]] ; CHECK: [[BB184]]: ; CHECK-NEXT: br label %[[BB185:.*]] ; CHECK: [[BB185]]: ; CHECK-NEXT: br i1 poison, label %[[BB185]], label %[[BB186]] ; CHECK: [[BB186]]: -; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[I178]], %[[BB167]] ], [ poison, %[[BB185]] ] +; CHECK-NEXT: [[I187:%.*]] = phi nsz float [ [[TMP36]], %[[BB167]] ], [ poison, %[[BB185]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 2b591a2165534..683b92752c702 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -402,19 +402,32 @@ entry: } define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) { -; CHECK-LABEL: define void @reuse_shuffle_indices_cost_crash_2( -; CHECK-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00 -; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00) -; CHECK-NEXT: store float [[TMP1]], ptr [[BEZT]], align 4 -; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr float, ptr [[BEZT]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> , float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer) -; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[ARRAYIDX5_I]], align 4 -; CHECK-NEXT: ret void +; NON-POW2-LABEL: define void @reuse_shuffle_indices_cost_crash_2( +; NON-POW2-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) { +; NON-POW2-NEXT: entry: +; NON-POW2-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00 +; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[FNEG]], i32 0 +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> , float [[TMP0]], i32 0 +; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP3]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP2]], <3 x float> [[TMP4]], <3 x float> zeroinitializer) +; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[BEZT]], align 4 +; NON-POW2-NEXT: ret void +; +; POW2-ONLY-LABEL: define void @reuse_shuffle_indices_cost_crash_2( +; POW2-ONLY-SAME: ptr [[BEZT:%.*]], float [[TMP0:%.*]]) { +; POW2-ONLY-NEXT: entry: +; POW2-ONLY-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00 +; POW2-ONLY-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer +; POW2-ONLY-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0 +; POW2-ONLY-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer +; POW2-ONLY-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer) +; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[BEZT]], align 4 +; POW2-ONLY-NEXT: [[TMP6:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00) +; POW2-ONLY-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2 +; POW2-ONLY-NEXT: store float [[TMP6]], ptr [[ARRAYIDX8_I831]], align 4 +; POW2-ONLY-NEXT: ret void ; entry: %fneg = fmul float %0, 0.000000e+00 diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll index b5d74f0b91ab8..eefc99feebb95 100644 --- a/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll +++ b/llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll @@ -24,21 +24,17 @@ define <4 x double> @test(ptr %p2, double %i1754, double %i1781, double %i1778) ; AARCH86-NEXT: entry: ; AARCH86-NEXT: [[I1771:%.*]] = getelementptr inbounds double, ptr [[P2:%.*]], i64 54 ; AARCH86-NEXT: [[I1772:%.*]] = load double, ptr [[I1771]], align 8 -; AARCH86-NEXT: [[I1773:%.*]] = fmul fast double [[I1772]], [[I1754:%.*]] -; AARCH86-NEXT: [[I1782:%.*]] = fmul fast double [[I1754]], [[I1754]] -; AARCH86-NEXT: [[I1783:%.*]] = fadd fast double [[I1782]], 1.000000e+00 -; AARCH86-NEXT: [[I1787:%.*]] = fmul fast double [[I1778:%.*]], [[I1754]] -; AARCH86-NEXT: [[I1788:%.*]] = fadd fast double [[I1787]], 1.000000e+00 -; AARCH86-NEXT: [[I1792:%.*]] = fmul fast double [[I1754]], [[I1781:%.*]] -; AARCH86-NEXT: [[I1793:%.*]] = fadd fast double [[I1792]], 1.000000e+00 ; AARCH86-NEXT: [[I1795:%.*]] = getelementptr inbounds double, ptr [[P2]], i64 55 ; AARCH86-NEXT: [[I1796:%.*]] = load double, ptr [[I1795]], align 8 -; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781]] -; AARCH86-NEXT: [[TMP4:%.*]] = fadd fast double [[I1773]], [[I1797]] -; AARCH86-NEXT: [[I1976:%.*]] = insertelement <4 x double> zeroinitializer, double [[I1783]], i64 0 -; AARCH86-NEXT: [[I1982:%.*]] = insertelement <4 x double> [[I1976]], double [[I1788]], i64 1 -; AARCH86-NEXT: [[I1988:%.*]] = insertelement <4 x double> [[I1982]], double [[I1793]], i64 2 -; AARCH86-NEXT: [[I1994:%.*]] = insertelement <4 x double> [[I1988]], double [[TMP4]], i64 3 +; AARCH86-NEXT: [[I1797:%.*]] = fmul fast double [[I1796]], [[I1781:%.*]] +; AARCH86-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[I1754:%.*]], i32 0 +; AARCH86-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[I1778:%.*]], i32 1 +; AARCH86-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[I1781]], i32 2 +; AARCH86-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[I1772]], i32 3 +; AARCH86-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> zeroinitializer +; AARCH86-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP4]] +; AARCH86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> , double [[I1797]], i32 3 +; AARCH86-NEXT: [[I1994:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP6]] ; AARCH86-NEXT: ret <4 x double> [[I1994]] ; entry: