From fc515fbb2b74f427fb4aa0cd7f1f9ada10ef2b4d Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 6 Mar 2025 16:18:06 +0000 Subject: [PATCH 1/3] [CostModel] Add type-based cost model for get.active.lane.mask intrinsic I recently realised that we return an invalid cost when requesting the type-based cost for the get.active.lane.mask intrinsic. I've fixed that in this patch by reusing the existing code for the non-type-based model. --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 47 +++++++++++-------- .../CostModel/AArch64/sve-intrinsics.ll | 24 +++++----- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 563953516a354..a663c350a8655 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1691,6 +1691,29 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { return Cost; } + InstructionCost getActiveLaneMaskCost(Type *RetTy, Type *ArgTy, + TTI::TargetCostKind CostKind) { + EVT ResVT = getTLI()->getValueType(DL, RetTy, true); + EVT ArgVT = getTLI()->getValueType(DL, ArgTy, true); + + // If we're not expanding the intrinsic then we assume this is cheap + // to implement. + if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgVT)) + return getTypeLegalizationCost(RetTy).first; + + // Create the expanded types that will be used to calculate the uadd_sat + // operation. + Type *ExpRetTy = + VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount()); + IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, + FastMathFlags()); + InstructionCost Cost = + thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, + CmpInst::ICMP_ULT, CostKind); + return Cost; + } + /// Get intrinsic cost based on arguments.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { @@ -1987,25 +2010,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { return Cost; } case Intrinsic::get_active_lane_mask: { - EVT ResVT = getTLI()->getValueType(DL, RetTy, true); - EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); - - // If we're not expanding the intrinsic then we assume this is cheap - // to implement. - if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { - return getTypeLegalizationCost(RetTy).first; - } - - // Create the expanded types that will be used to calculate the uadd_sat - // operation. - Type *ExpRetTy = VectorType::get( - ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount()); - IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); - InstructionCost Cost = - thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, - CmpInst::ICMP_ULT, CostKind); - return Cost; + return thisT()->getActiveLaneMaskCost(RetTy, ICA.getArgTypes()[0], + CostKind); } case Intrinsic::experimental_cttz_elts: { EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); @@ -2394,6 +2400,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind); return Cost; } + case Intrinsic::get_active_lane_mask: + return thisT()->getActiveLaneMaskCost(RetTy, ICA.getArgTypes()[0], + CostKind); case Intrinsic::abs: ISD = ISD::ABS; break; diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll index 0bf776b5c97e3..265ff89b36050 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -958,16 +958,16 @@ define void @get_lane_mask() #0 { ; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0
for instruction: ret void ; ; TYPE_BASED_ONLY-LABEL: 'get_lane_mask' -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv8i1_i64 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for
instruction: %mask_nxv4i1_i64 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv2i1_i64 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv16i1_i32 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 undef, i32 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv8i1_i32 = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 undef, i32 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv4i1_i32 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 undef, i32 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef) @@ -976,8 +976,8 @@ define void @get_lane_mask() #0 { ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32
undef, i32 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef) From 3097f5491ad81c9d6f43990f67796173ea3f6b5a Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 7 Mar 2025 10:20:00 +0000 Subject: [PATCH 2/3] Address review comment --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 53 ++++++++++-------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index a663c350a8655..3e322e95628a5 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1691,29 +1691,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { return Cost; } - InstructionCost getActiveLaneMaskCost(Type *RetTy, Type *ArgTy, - TTI::TargetCostKind CostKind) { - EVT ResVT =
getTLI()->getValueType(DL, RetTy, true); - EVT ArgVT = getTLI()->getValueType(DL, ArgTy, true); - - // If we're not expanding the intrinsic then we assume this is cheap - // to implement. - if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgVT)) - return getTypeLegalizationCost(RetTy).first; - - // Create the expanded types that will be used to calculate the uadd_sat - // operation. - Type *ExpRetTy = - VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount()); - IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, - FastMathFlags()); - InstructionCost Cost = - thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, - CmpInst::ICMP_ULT, CostKind); - return Cost; - } - /// Get intrinsic cost based on arguments. InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { @@ -2009,10 +1986,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { } return Cost; } - case Intrinsic::get_active_lane_mask: { - return thisT()->getActiveLaneMaskCost(RetTy, ICA.getArgTypes()[0], - CostKind); - } case Intrinsic::experimental_cttz_elts: { EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); @@ -2060,6 +2033,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { return Cost; } + case Intrinsic::get_active_lane_mask: case Intrinsic::experimental_vector_match: return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind); case Intrinsic::modf: @@ -2400,9 +2374,28 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind); return Cost; } - case Intrinsic::get_active_lane_mask: - return thisT()->getActiveLaneMaskCost(RetTy, ICA.getArgTypes()[0], - CostKind); + case Intrinsic::get_active_lane_mask: { + Type *ArgTy = ICA.getArgTypes()[0]; + EVT ResVT = getTLI()->getValueType(DL, RetTy, true); + EVT ArgVT =
getTLI()->getValueType(DL, ArgTy, true); + + // If we're not expanding the intrinsic then we assume this is cheap + // to implement. + if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgVT)) + return getTypeLegalizationCost(RetTy).first; + + // Create the expanded types that will be used to calculate the uadd_sat + // operation. + Type *ExpRetTy = + VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount()); + IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, + FastMathFlags()); + InstructionCost Cost = + thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, + CmpInst::ICMP_ULT, CostKind); + return Cost; + } case Intrinsic::abs: ISD = ISD::ABS; break; From 7f0015fffd7b29512706f4a00ba1663a4fc69092 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 7 Mar 2025 14:40:05 +0000 Subject: [PATCH 3/3] Address review comment --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 3e322e95628a5..bde19de0c87f5 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2388,8 +2388,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { // operation. Type *ExpRetTy = VectorType::get(ArgTy, cast<VectorType>(RetTy)->getElementCount()); - IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, - FastMathFlags()); + IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); InstructionCost Cost = thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,