Skip to content

Commit 0744053

Browse files
committed
Address review
1 parent eab11c8 commit 0744053

File tree

3 files changed

+40
-27
lines changed

3 files changed

+40
-27
lines changed

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2192,6 +2192,22 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
21922192
}
21932193
case Intrinsic::loop_dependence_raw_mask:
21942194
case Intrinsic::loop_dependence_war_mask: {
2195+
// Compute the cost of the expanded version of these intrinsics:
2196+
// ; Figure out if there's overlap between the pointers.
2197+
// diff = (ptrB - ptrA) / eltSize ; read-after-write will use the
2198+
// absolute difference
2199+
// cmp = diff <= 0 ; read-after-write will check for equality
2200+
// with 0
2201+
// ; Create a mask with each lane < diff active. This is essentially
2202+
// an active lane mask between 0 and diff.
2203+
// diff_splat = splat diff to <Y x i64>
2204+
// steps = stepvector <Y x i64>
2205+
// diff_mask = steps <= diff_splat
2206+
// ; OR that diff mask with the comparison result, so that each lane is
2207+
// active if it's less than diff or there was no overlap in the
2208+
// first place. Otherwise the lane is inactive.
2209+
// cmp_splat = splat cmp to <Y x i1>
2210+
// result = or cmp_splat diff_mask
21952211
InstructionCost Cost = 0;
21962212
Type *PtrTy = ICA.getArgTypes()[0];
21972213
bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1816,6 +1816,22 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) {
18161816
}
18171817

18181818
SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) {
1819+
// Expand these intrinsics:
1820+
// ; Figure out if there's overlap between the pointers.
1821+
// diff = (ptrB - ptrA) / eltSize ; read-after-write will use the absolute
1822+
// difference
1823+
// cmp = diff <= 0 ; read-after-write will check for equality
1824+
// with 0
1825+
// ; Create a mask with each lane < diff active. This is essentially an
1826+
// active lane mask between 0 and diff.
1827+
// diff_splat = splat diff to <Y x i64>
1828+
// steps = stepvector <Y x i64>
1829+
// diff_mask = steps <= diff_splat
1830+
// ; OR that diff mask with the comparison result, so that each lane is
1831+
// active if it's less than diff or there was no overlap in the
1832+
// first place. Otherwise the lane is inactive.
1833+
// cmp_splat = splat cmp to <Y x i1>
1834+
// result = or cmp_splat diff_mask
18191835
SDLoc DL(N);
18201836
SDValue SourceValue = N->getOperand(0);
18211837
SDValue SinkValue = N->getOperand(1);

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 8 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,37 +1034,18 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
10341034
}
10351035
case Intrinsic::loop_dependence_raw_mask:
10361036
case Intrinsic::loop_dependence_war_mask: {
1037-
auto *EltSize = cast<ConstantInt>(ICA.getArgs()[2]);
1037+
unsigned EltSizeInBytes =
1038+
cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
10381039
EVT VecVT = getTLI()->getValueType(DL, RetTy);
10391040
// An invalid element size and return type combination must be expanded.
1040-
bool MustBeExpanded = false;
1041-
switch (EltSize->getSExtValue()) {
1042-
case 1:
1043-
if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1)
1044-
MustBeExpanded = true;
1045-
break;
1046-
case 2:
1047-
if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1)
1048-
MustBeExpanded = true;
1049-
break;
1050-
case 4:
1051-
if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1)
1052-
MustBeExpanded = true;
1053-
break;
1054-
case 8:
1055-
if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1)
1056-
MustBeExpanded = true;
1057-
break;
1058-
default:
1059-
MustBeExpanded = true;
1060-
// Other element sizes are incompatible with whilewr/rw, so expand instead
1061-
break;
1062-
}
1041+
bool MustBeExpanded =
1042+
VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes) ||
1043+
!isPowerOf2_32(EltSizeInBytes) || EltSizeInBytes > 8;
10631044

10641045
// The whilewr/rw instructions require SVE2 or SME
1065-
if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME()))
1066-
break;
1067-
return 1;
1046+
if (!MustBeExpanded && (ST->hasSVE2() || ST->hasSME()))
1047+
return 1;
1048+
break;
10681049
}
10691050
case Intrinsic::experimental_vector_extract_last_active:
10701051
if (ST->isSVEorStreamingSVEAvailable()) {

0 commit comments

Comments
 (0)