Skip to content

Commit dc3dba5

Browse files
committed
[Intrinsics][AArch64] Add intrinsic to mask off aliasing vector lanes
It can be unsafe to load a vector from an address and write a vector to an address if those two addresses have overlapping lanes within a vectorised loop iteration. This PR adds an intrinsic designed to create a mask with lanes disabled if they overlap between the two pointer arguments, so that only safe lanes are loaded, operated on and stored. Along with the two pointer parameters, the intrinsic also takes an immediate that represents the size in bytes of the vector element types, as well as an immediate i1 that is true if there is a write-after-read hazard or false if there is a read-after-write hazard. This will be used by llvm#100579 and replaces the existing lowering for whilewr since that isn't needed now we have the intrinsic.
1 parent 71b87d1 commit dc3dba5

File tree

11 files changed

+714
-1216
lines changed

11 files changed

+714
-1216
lines changed

llvm/docs/LangRef.rst

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23475,6 +23475,86 @@ Examples:
2347523475
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %elem0, i64 429)
2347623476
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
2347723477

23478+
.. _int_experimental_get_alias_lane_mask:
23479+
23480+
'``llvm.experimental.get.alias.lane.mask.*``' Intrinsics
23481+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
23482+
23483+
Syntax:
23484+
"""""""
23485+
This is an overloaded intrinsic.
23486+
23487+
::
23488+
23489+
declare <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
23490+
declare <8 x i1> @llvm.experimental.get.alias.lane.mask.v8i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
23491+
declare <16 x i1> @llvm.experimental.get.alias.lane.mask.v16i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
23492+
declare <vscale x 16 x i1> @llvm.experimental.get.alias.lane.mask.nxv16i1.i64(i64 %ptrA, i64 %ptrB, i32 immarg %elementSize, i1 immarg %writeAfterRead)
23493+
23494+
23495+
Overview:
23496+
"""""""""
23497+
23498+
Create a mask representing lanes that do or do not overlap between two pointers across one vector loop iteration.
23499+
23500+
23501+
Arguments:
23502+
""""""""""
23503+
23504+
The first two arguments have the same scalar integer type.
23505+
The final two are immediates and the result is a vector with the i1 element type.
23506+
23507+
Semantics:
23508+
""""""""""
23509+
23510+
In the case that ``%writeAfterRead`` is true, the '``llvm.experimental.get.alias.lane.mask.*``' intrinsics are semantically equivalent
23511+
to:
23512+
23513+
::
23514+
23515+
%diff = (%ptrB - %ptrA) / %elementSize
23516+
%m[i] = (icmp ult i, %diff) || (%diff <= 0)
23517+
23518+
Otherwise they are semantically equivalent to:
23519+
23520+
::
23521+
23522+
%diff = abs(%ptrB - %ptrA) / %elementSize
23523+
%m[i] = (icmp ult i, %diff) || (%diff == 0)
23524+
23525+
where ``%m`` is a vector (mask) of active/inactive lanes with its elements
23526+
indexed by ``i``, and ``%ptrA``, ``%ptrB`` are the two i64 arguments to
23527+
``llvm.experimental.get.alias.lane.mask.*``, ``%elementSize`` is the i32 argument, ``abs`` is the absolute value operation, ``icmp`` is an integer compare and ``ult``
23528+
the unsigned less-than comparison operator. The subtraction between ``%ptrA`` and ``%ptrB`` could be negative. The ``%writeAfterRead`` argument is expected to be true if the ``%ptrB`` is stored to after ``%ptrA`` is read from.
23529+
The above is equivalent to:
23530+
23531+
::
23532+
23533+
%m = @llvm.experimental.get.alias.lane.mask(%ptrA, %ptrB, %elementSize, %writeAfterRead)
23534+
23535+
This can, for example, be emitted by the loop vectorizer in which case
23536+
``%ptrA`` is a pointer that is read from within the loop, and ``%ptrB`` is a pointer that is stored to within the loop.
23537+
If the difference between these pointers is less than the vector factor, then they overlap (alias) within a loop iteration.
23538+
An example is if ``%ptrA`` is 20 and ``%ptrB`` is 23 with a vector factor of 8, then lanes 3, 4, 5, 6 and 7 of the vector loaded from ``%ptrA``
23539+
share addresses with lanes 0, 1, 2, 3 and 4 from the vector stored to at ``%ptrB``.
23540+
An alias mask of these two pointers should be <1, 1, 1, 0, 0, 0, 0, 0> so that only the non-overlapping lanes are loaded and stored.
23541+
This operation allows many loops to be vectorised when it would otherwise be unsafe to do so.
23542+
23543+
To account for the fact that only a subset of lanes have been operated on in an iteration,
23544+
the loop's induction variable should be incremented by the popcount of the mask rather than the vector factor.
23545+
23546+
This mask ``%m`` can e.g. be used in masked load/store instructions.
23547+
23548+
23549+
Examples:
23550+
"""""""""
23551+
23552+
.. code-block:: llvm
23553+
23554+
%alias.lane.mask = call <4 x i1> @llvm.experimental.get.alias.lane.mask.v4i1.i64(i64 %ptrA, i64 %ptrB, i32 4, i1 1)
23555+
%vecA = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptrA, i32 4, <4 x i1> %alias.lane.mask, <4 x i32> poison)
23556+
[...]
23557+
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, <4 x i32>* %ptrB, i32 4, <4 x i1> %alias.lane.mask)
2347823558

2347923559
.. _int_experimental_vp_splice:
2348023560

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,11 @@ class TargetLoweringBase {
468468
return true;
469469
}
470470

471+
/// Return true if the @llvm.experimental.get.alias.lane.mask intrinsic should be expanded using generic code in SelectionDAGBuilder.
/// \p VT is the type of the mask the intrinsic returns, \p PtrVT is the type
/// of its two pointer operands and \p EltSize is its element-size immediate
/// (the vector element size in bytes).
/// This default implementation always asks for the generic expansion.
472+
virtual bool shouldExpandGetAliasLaneMask(EVT VT, EVT PtrVT, unsigned EltSize) const {
473+
return true;
474+
}
475+
471476
virtual bool shouldExpandGetVectorLength(EVT CountVT, unsigned VF,
472477
bool IsScalable) const {
473478
return true;

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2363,6 +2363,11 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>
23632363
llvm_i32_ty]>;
23642364
}
23652365

2366+
// Alias lane mask: returns an overloaded vector type. Operands 0 and 1 share
// one overloaded integer type (LLVMMatchType<1>); operand 2 (element size) and
// operand 3 (the i1 write-after-read flag) must be immediates.
// NOTE(review): operand 2 is declared llvm_anyint_ty, which makes it a third
// overloaded type, whereas LangRef documents it as i32 and shows names with
// only two type suffixes (e.g. ".v4i1.i64") - confirm which is intended.
def int_experimental_get_alias_lane_mask:
2367+
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
2368+
[llvm_anyint_ty, LLVMMatchType<1>, llvm_anyint_ty, llvm_i1_ty],
2369+
[IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
2370+
23662371
def int_get_active_lane_mask:
23672372
DefaultAttrsIntrinsic<[llvm_anyvector_ty],
23682373
[llvm_anyint_ty, LLVMMatchType<1>],

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8284,6 +8284,50 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
82848284
visitVectorExtractLastActive(I, Intrinsic);
82858285
return;
82868286
}
8287+
case Intrinsic::experimental_get_alias_lane_mask: {
  // Generic lowering of llvm.experimental.get.alias.lane.mask:
  //   Diff    = (Sink - Source) / EltSize   (abs() is applied first when the
  //                                          hazard is read-after-write)
  //   Mask[i] = (i u< Diff) || (Diff <= 0)  (Diff == 0 for read-after-write)
  // Targets that return false from shouldExpandGetAliasLaneMask are handed
  // the intrinsic unchanged instead.
  SDValue SourceValue = getValue(I.getOperand(0));
  SDValue SinkValue = getValue(I.getOperand(1));
  // Operands 2 and 3 are immargs, so they always lower to constant nodes.
  unsigned EltSizeInBytes =
      cast<ConstantSDNode>(getValue(I.getOperand(2)))->getZExtValue();
  bool IsWriteAfterRead =
      cast<ConstantSDNode>(getValue(I.getOperand(3)))->getZExtValue() != 0;
  EVT IntrinsicVT = EVT::getEVT(I.getType());
  // Query the SDValue itself so its result number is honoured, rather than
  // assuming result 0 of the defining node.
  EVT PtrVT = SourceValue.getValueType();

  if (!TLI.shouldExpandGetAliasLaneMask(IntrinsicVT, PtrVT, EltSizeInBytes)) {
    visitTargetIntrinsic(I, Intrinsic);
    return;
  }

  SDValue Diff = DAG.getNode(ISD::SUB, sdl, PtrVT, SinkValue, SourceValue);
  if (!IsWriteAfterRead)
    Diff = DAG.getNode(ISD::ABS, sdl, PtrVT, Diff);

  // Re-materialise the element size in the pointer type: the immediate has its
  // own integer type (i32 in LangRef), and both SDIV operands must have the
  // same type as the result.
  SDValue EltSize = DAG.getConstant(EltSizeInBytes, sdl, PtrVT);
  Diff = DAG.getNode(ISD::SDIV, sdl, PtrVT, Diff, EltSize);
  // Zero is an ordinary operand of the compare / lane-mask nodes below, so it
  // is a regular constant rather than a target constant.
  SDValue Zero = DAG.getConstant(0, sdl, PtrVT);

  // A non-positive distance (zero only, for read-after-write) means no lane
  // can alias, so every lane is enabled in that case.
  EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                     PtrVT);
  SDValue Cmp = DAG.getSetCC(sdl, CmpVT, Diff, Zero,
                             IsWriteAfterRead ? ISD::SETLE : ISD::SETEQ);

  // Splat the compare result then OR it with a lane mask
  SDValue Splat = DAG.getSplat(IntrinsicVT, sdl, Cmp);

  SDValue DiffMask;
  // Don't emit an active lane mask if the target doesn't support it
  if (TLI.shouldExpandGetActiveLaneMask(IntrinsicVT, PtrVT)) {
    // Open-coded lane mask: step vector <0, 1, 2, ...> u< splat(Diff).
    EVT VecTy = EVT::getVectorVT(*DAG.getContext(), PtrVT,
                                 IntrinsicVT.getVectorElementCount());
    SDValue DiffSplat = DAG.getSplat(VecTy, sdl, Diff);
    SDValue VectorStep = DAG.getStepVector(sdl, VecTy);
    DiffMask = DAG.getSetCC(sdl, IntrinsicVT, VectorStep, DiffSplat,
                            ISD::CondCode::SETULT);
  } else {
    // The target handles get.active.lane.mask itself: lanes [0, Diff) active.
    DiffMask = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, sdl, IntrinsicVT,
        DAG.getTargetConstant(Intrinsic::get_active_lane_mask, sdl, MVT::i64),
        Zero, Diff);
  }
  SDValue Or = DAG.getNode(ISD::OR, sdl, IntrinsicVT, DiffMask, Splat);
  setValue(&I, Or);
  return;
}
82878331
}
82888332
}
82898333

0 commit comments

Comments
 (0)