Skip to content

Commit f9e5a7c

Browse files
committed
[Intrinsics][AArch64] Add intrinsic to mask off aliasing vector lanes
It can be unsafe to load a vector from an address and write a vector to an address if those two addresses have overlapping lanes within a vectorised loop iteration. This PR adds an intrinsic designed to create a mask with lanes disabled if they overlap between the two pointer arguments, so that only safe lanes are loaded, operated on and stored. Along with the two pointer parameters, the intrinsic also takes an immediate that represents the size in bytes of the vector element types, as well as an immediate i1 that is true if there is a write-after-read hazard or false if there is a read-after-write hazard. This will be used by llvm#100579 and replaces the existing lowering for whilewr since that isn't needed now we have the intrinsic.
1 parent 263e458 commit f9e5a7c

File tree

10 files changed

+861
-10
lines changed

10 files changed

+861
-10
lines changed

llvm/docs/LangRef.rst

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24104,6 +24104,90 @@ Examples:
2410424104
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %elem0, i64 429)
2410524105
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
2410624106

24107+
.. _int_experimental_get_alias_lane_mask:
24108+
24109+
'``llvm.experimental.get.alias.lane.mask.*``' Intrinsics
24110+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
24111+
24112+
Syntax:
24113+
"""""""
24114+
This is an overloaded intrinsic.
24115+
24116+
::
24117+
24118+
declare <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64.i64(i64 %ptrA, i64 %ptrB, i64 immarg %elementSize, i1 immarg %writeAfterRead)
24119+
declare <8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64.i64(i64 %ptrA, i64 %ptrB, i64 immarg %elementSize, i1 immarg %writeAfterRead)
24120+
declare <16 x i1> @llvm.experimental.get.alias.lane.mask.v16i1.i64.i32(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
24121+
declare <vscale x 16 x i1> @llvm.experimental.get.alias.lane.mask.nxv16i1.i64.i32(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
24122+
24123+
24124+
Overview:
24125+
"""""""""
24126+
24127+
Create a mask representing lanes that do or do not overlap between two pointers
24128+
across one vector loop iteration.
24129+
24130+
24131+
Arguments:
24132+
""""""""""
24133+
24134+
The first two arguments have the same scalar integer type.
24135+
The final two are immediates and the result is a vector with the i1 element type.
24136+
24137+
Semantics:
24138+
""""""""""
24139+
24140+
The intrinsic will return poison if ``%ptrA`` and ``%ptrB`` are within
24141+
VF * ``%elementSize`` of each other and ``%ptrA`` + VF * ``%elementSize`` wraps.
24142+
In other cases when ``%writeAfterRead`` is true, the
24143+
'``llvm.experimental.get.alias.lane.mask.*``' intrinsics are semantically
24144+
equivalent to:
24145+
24146+
::
24147+
24148+
%diff = (%ptrB - %ptrA) / %elementSize
24149+
%m[i] = (icmp ult i, %diff) || (%diff <= 0)
24150+
24151+
When the return value is not poison and ``%writeAfterRead`` is false, the
24152+
'``llvm.experimental.get.alias.lane.mask.*``' intrinsics are semantically
24153+
equivalent to:
24154+
24155+
::
24156+
24157+
%diff = abs(%ptrB - %ptrA) / %elementSize
24158+
%m[i] = (icmp ult i, %diff) || (%diff == 0)
24159+
24160+
where ``%m`` is a vector (mask) of active/inactive lanes with its elements
24161+
indexed by ``i``, and ``%ptrA``, ``%ptrB`` are the two i64 arguments to
24162+
``llvm.experimental.get.alias.lane.mask.*`` and ``%elementSize`` is the first
24163+
immediate argument. The ``%writeAfterRead`` argument is expected to be true if
24164+
``%ptrB`` is stored to after ``%ptrA`` is read from.
24165+
The above is equivalent to:
24166+
24167+
::
24168+
24169+
%m = @llvm.experimental.get.alias.lane.mask(%ptrA, %ptrB, %elementSize, %writeAfterRead)
24170+
24171+
This can, for example, be emitted by the loop vectorizer in which case
24172+
``%ptrA`` is a pointer that is read from within the loop, and ``%ptrB`` is a
24173+
pointer that is stored to within the loop.
24174+
If the difference between these pointers, in units of the element size, is less than the vector factor, then
24175+
they overlap (alias) within a loop iteration.
24176+
An example is if ``%ptrA`` is 20 and ``%ptrB`` is 23 with a vector factor of 8,
24177+
then lanes 3, 4, 5, 6 and 7 of the vector loaded from ``%ptrA``
24178+
share addresses with lanes 0, 1, 2, 3 and 4 from the vector stored to at
24179+
``%ptrB``.
24180+
24181+
24182+
Examples:
24183+
"""""""""
24184+
24185+
.. code-block:: llvm
24186+
24187+
%alias.lane.mask = call <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64.i32(i64 %ptrA, i64 %ptrB, i32 4, i1 1)
24188+
%vecA = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptrA, i32 4, <4 x i1> %alias.lane.mask, <4 x i32> poison)
24189+
[...]
24190+
call @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, <4 x i32>* %ptrB, i32 4, <4 x i1> %alias.lane.mask)
2410724191

2410824192
.. _int_experimental_vp_splice:
2410924193

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,13 @@ class LLVM_ABI TargetLoweringBase {
482482
return true;
483483
}
484484

485+
/// Return true if the @llvm.experimental.get.alias.lane.mask intrinsic should
486+
/// be expanded using generic code in SelectionDAGBuilder.
487+
virtual bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT,
488+
unsigned EltSize) const {
489+
return true;
490+
}
491+
485492
virtual bool shouldExpandGetVectorLength(EVT CountVT, unsigned VF,
486493
bool IsScalable) const {
487494
return true;

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2420,6 +2420,11 @@ let IntrProperties = [IntrNoMem, ImmArg<ArgIndex<1>>] in {
24202420
llvm_i32_ty]>;
24212421
}
24222422

2423+
def int_experimental_get_alias_lane_mask:
2424+
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
2425+
[llvm_anyint_ty, LLVMMatchType<1>, llvm_anyint_ty, llvm_i1_ty],
2426+
[IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
2427+
24232428
def int_get_active_lane_mask:
24242429
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
24252430
[llvm_anyint_ty, LLVMMatchType<1>],

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8314,6 +8314,56 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
83148314
visitVectorExtractLastActive(I, Intrinsic);
83158315
return;
83168316
}
8317+
case Intrinsic::experimental_get_alias_lane_mask: {
8318+
SDValue SourceValue = getValue(I.getOperand(0));
8319+
SDValue SinkValue = getValue(I.getOperand(1));
8320+
SDValue EltSize = getValue(I.getOperand(2));
8321+
bool IsWriteAfterRead =
8322+
cast<ConstantSDNode>(getValue(I.getOperand(3)))->getZExtValue() != 0;
8323+
auto IntrinsicVT = EVT::getEVT(I.getType());
8324+
auto PtrVT = SourceValue->getValueType(0);
8325+
8326+
if (!TLI.shouldExpandGetAliasLaneMask(
8327+
IntrinsicVT, PtrVT,
8328+
cast<ConstantSDNode>(EltSize)->getSExtValue())) {
8329+
visitTargetIntrinsic(I, Intrinsic);
8330+
return;
8331+
}
8332+
8333+
SDValue Diff = DAG.getNode(ISD::SUB, sdl, PtrVT, SinkValue, SourceValue);
8334+
if (!IsWriteAfterRead)
8335+
Diff = DAG.getNode(ISD::ABS, sdl, PtrVT, Diff);
8336+
8337+
Diff = DAG.getNode(ISD::SDIV, sdl, PtrVT, Diff, EltSize);
8338+
SDValue Zero = DAG.getTargetConstant(0, sdl, PtrVT);
8339+
8340+
// If the difference is positive then some elements may alias
8341+
auto CmpVT =
8342+
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), PtrVT);
8343+
SDValue Cmp = DAG.getSetCC(sdl, CmpVT, Diff, Zero,
8344+
IsWriteAfterRead ? ISD::SETLE : ISD::SETEQ);
8345+
8346+
// Splat the compare result then OR it with a lane mask
8347+
SDValue Splat = DAG.getSplat(IntrinsicVT, sdl, Cmp);
8348+
8349+
SDValue DiffMask;
8350+
// Don't emit an active lane mask if the target doesn't support it
8351+
if (TLI.shouldExpandGetActiveLaneMask(IntrinsicVT, PtrVT)) {
8352+
EVT VecTy = EVT::getVectorVT(*DAG.getContext(), PtrVT,
8353+
IntrinsicVT.getVectorElementCount());
8354+
SDValue DiffSplat = DAG.getSplat(VecTy, sdl, Diff);
8355+
SDValue VectorStep = DAG.getStepVector(sdl, VecTy);
8356+
DiffMask = DAG.getSetCC(sdl, IntrinsicVT, VectorStep, DiffSplat,
8357+
ISD::CondCode::SETULT);
8358+
} else {
8359+
DiffMask = DAG.getNode(
8360+
ISD::INTRINSIC_WO_CHAIN, sdl, IntrinsicVT,
8361+
DAG.getTargetConstant(Intrinsic::get_active_lane_mask, sdl, MVT::i64),
8362+
Zero, Diff);
8363+
}
8364+
SDValue Or = DAG.getNode(ISD::OR, sdl, IntrinsicVT, DiffMask, Splat);
8365+
setValue(&I, Or);
8366+
}
83178367
}
83188368
}
83198369

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2160,6 +2160,25 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
21602160
return false;
21612161
}
21622162

2163+
bool AArch64TargetLowering::shouldExpandGetAliasLaneMask(
2164+
EVT VT, EVT PtrVT, unsigned EltSize) const {
2165+
if (!Subtarget->hasSVE2())
2166+
return true;
2167+
2168+
if (PtrVT != MVT::i64)
2169+
return true;
2170+
2171+
if (VT == MVT::v2i1 || VT == MVT::nxv2i1)
2172+
return EltSize != 8;
2173+
if (VT == MVT::v4i1 || VT == MVT::nxv4i1)
2174+
return EltSize != 4;
2175+
if (VT == MVT::v8i1 || VT == MVT::nxv8i1)
2176+
return EltSize != 2;
2177+
if (VT == MVT::v16i1 || VT == MVT::nxv16i1)
2178+
return EltSize != 1;
2179+
return true;
2180+
}
2181+
21632182
bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
21642183
const IntrinsicInst *I) const {
21652184
assert(I->getIntrinsicID() ==
@@ -5987,6 +6006,18 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
59876006
EVT PtrVT = getPointerTy(DAG.getDataLayout());
59886007
return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
59896008
}
6009+
case Intrinsic::aarch64_sve_whilewr_b:
6010+
case Intrinsic::aarch64_sve_whilewr_h:
6011+
case Intrinsic::aarch64_sve_whilewr_s:
6012+
case Intrinsic::aarch64_sve_whilewr_d:
6013+
return DAG.getNode(AArch64ISD::WHILEWR, dl, Op.getValueType(),
6014+
Op.getOperand(1), Op.getOperand(2));
6015+
case Intrinsic::aarch64_sve_whilerw_b:
6016+
case Intrinsic::aarch64_sve_whilerw_h:
6017+
case Intrinsic::aarch64_sve_whilerw_s:
6018+
case Intrinsic::aarch64_sve_whilerw_d:
6019+
return DAG.getNode(AArch64ISD::WHILERW, dl, Op.getValueType(),
6020+
Op.getOperand(1), Op.getOperand(2));
59906021
case Intrinsic::aarch64_neon_abs: {
59916022
EVT Ty = Op.getValueType();
59926023
if (Ty == MVT::i64) {
@@ -6461,6 +6492,52 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
64616492
return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
64626493
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
64636494
}
6495+
case Intrinsic::experimental_get_alias_lane_mask: {
6496+
unsigned IntrinsicID = 0;
6497+
uint64_t EltSize = Op.getOperand(3)->getAsZExtVal();
6498+
bool IsWriteAfterRead = Op.getOperand(4)->getAsZExtVal() == 1;
6499+
switch (EltSize) {
6500+
case 1:
6501+
IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_b
6502+
: Intrinsic::aarch64_sve_whilerw_b;
6503+
break;
6504+
case 2:
6505+
IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_h
6506+
: Intrinsic::aarch64_sve_whilerw_h;
6507+
break;
6508+
case 4:
6509+
IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_s
6510+
: Intrinsic::aarch64_sve_whilerw_s;
6511+
break;
6512+
case 8:
6513+
IntrinsicID = IsWriteAfterRead ? Intrinsic::aarch64_sve_whilewr_d
6514+
: Intrinsic::aarch64_sve_whilerw_d;
6515+
break;
6516+
default:
6517+
llvm_unreachable("Unexpected element size for get.alias.lane.mask");
6518+
break;
6519+
}
6520+
SDValue ID = DAG.getTargetConstant(IntrinsicID, dl, MVT::i64);
6521+
6522+
EVT VT = Op.getValueType();
6523+
if (VT.isScalableVector())
6524+
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
6525+
Op.getOperand(2));
6526+
6527+
// We can use the SVE whilewr/whilerw instruction to lower this
6528+
// intrinsic by creating the appropriate sequence of scalable vector
6529+
// operations and then extracting a fixed-width subvector from the scalable
6530+
// vector.
6531+
6532+
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6533+
EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
6534+
6535+
SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
6536+
Op.getOperand(1), Op.getOperand(2));
6537+
SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
6538+
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
6539+
DAG.getVectorIdxConstant(0, dl));
6540+
}
64646541
case Intrinsic::aarch64_neon_saddlv:
64656542
case Intrinsic::aarch64_neon_uaddlv: {
64666543
EVT OpVT = Op.getOperand(1).getValueType();
@@ -19961,7 +20038,10 @@ static bool isPredicateCCSettingOp(SDValue N) {
1996120038
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
1996220039
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
1996320040
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
19964-
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt)))
20041+
N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
20042+
// get_alias_lane_mask is lowered to a whilewr/rw instruction.
20043+
N.getConstantOperandVal(0) ==
20044+
Intrinsic::experimental_get_alias_lane_mask)))
1996520045
return true;
1996620046

1996720047
return false;
@@ -28232,7 +28312,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
2823228312
DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
2823328313
return;
2823428314
}
28235-
case Intrinsic::experimental_vector_match: {
28315+
case Intrinsic::experimental_vector_match:
28316+
case Intrinsic::experimental_get_alias_lane_mask: {
2823628317
if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
2823728318
return;
2823828319

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,9 @@ class AArch64TargetLowering : public TargetLowering {
513513

514514
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
515515

516+
bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT,
517+
unsigned EltSize) const override;
518+
516519
bool
517520
shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;
518521

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,11 @@ def AArch64st1q_scatter : SDNode<"AArch64ISD::SST1Q_PRED", SDT_AArch64_SCATTER_V
167167
// AArch64 SVE/SVE2 - the remaining node definitions
168168
//
169169

170+
// Alias masks
171+
def SDT_AArch64Mask : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<2, 1>, SDTCVecEltisVT<0,i1>]>;
172+
def AArch64whilewr : SDNode<"AArch64ISD::WHILEWR", SDT_AArch64Mask>;
173+
def AArch64whilerw : SDNode<"AArch64ISD::WHILERW", SDT_AArch64Mask>;
174+
170175
// SVE CNT/INC/RDVL
171176
def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">;
172177
def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">;
@@ -4125,9 +4130,9 @@ let Predicates = [HasSVE2_or_SME] in {
41254130
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi, get_active_lane_mask>;
41264131

41274132
// SVE2 pointer conflict compare
4128-
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
4129-
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
4130-
} // End HasSVE2_or_SME
4133+
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", AArch64whilewr>;
4134+
defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", AArch64whilerw>;
4135+
} // End HasSVE2_or_SME
41314136

41324137
let Predicates = [HasSVEAES, HasNonStreamingSVE_or_SSVE_AES] in {
41334138
// SVE2 crypto destructive binary operations

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5946,16 +5946,16 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
59465946
let isWhile = 1;
59475947
}
59485948

5949-
multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
5949+
multiclass sve2_int_while_rr<bits<1> rw, string asm, SDPatternOperator op> {
59505950
def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
59515951
def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
59525952
def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
59535953
def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
59545954

5955-
def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>;
5956-
def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
5957-
def : SVE_2_Op_Pat<nxv4i1, !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
5958-
def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
5955+
def : SVE_2_Op_Pat<nxv16i1, op, i64, i64, !cast<Instruction>(NAME # _B)>;
5956+
def : SVE_2_Op_Pat<nxv8i1, op, i64, i64, !cast<Instruction>(NAME # _H)>;
5957+
def : SVE_2_Op_Pat<nxv4i1, op, i64, i64, !cast<Instruction>(NAME # _S)>;
5958+
def : SVE_2_Op_Pat<nxv2i1, op, i64, i64, !cast<Instruction>(NAME # _D)>;
59595959
}
59605960

59615961
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)