-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[LV] Vectorize FMax via OrderedFCmpSelect w/o fast-math flags. #146711
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 4 commits
9d28282
5675396
ec473e5
caae126
92ebac1
25a1f39
d50a372
43ff8ed
a3ae508
52b72e7
88e581e
b92dde2
e59025f
d3a4ca9
6605452
32baf05
02156d9
6793b20
f579116
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,9 @@ enum class RecurKind { | |
FMul, ///< Product of floats. | ||
FMin, ///< FP min implemented in terms of select(cmp()). | ||
FMax, ///< FP max implemented in terms of select(cmp()). | ||
FCmpOGTSelect, ///< FP max implemented in terms of select(cmp()), but without | ||
/// any fast-math flags. Users need to handle NaNs and signed | ||
/// zeros when generating code. | ||
|
||
FMinimum, ///< FP min with llvm.minimum semantics | ||
FMaximum, ///< FP max with llvm.maximum semantics | ||
FMinimumNum, ///< FP min with llvm.minimumnum semantics | ||
|
@@ -250,8 +253,9 @@ class RecurrenceDescriptor { | |
/// Returns true if the recurrence kind is a floating-point min/max kind. | ||
static bool isFPMinMaxRecurrenceKind(RecurKind Kind) { | ||
return Kind == RecurKind::FMin || Kind == RecurKind::FMax || | ||
Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum || | ||
Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum; | ||
Kind == RecurKind::FCmpOGTSelect || Kind == RecurKind::FMinimum || | ||
Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimumNum || | ||
Kind == RecurKind::FMaximumNum; | ||
} | ||
|
||
/// Returns true if the recurrence kind is any min/max kind. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -819,7 +819,8 @@ RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind, | |
if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value()))) | ||
return InstDesc(Kind == RecurKind::FMin, I); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Only max is handled by OrderedFCmpSelect, not min? Can start w/ FMaxOGT only. |
||
if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value()))) | ||
return InstDesc(Kind == RecurKind::FMax, I); | ||
return InstDesc(Kind == RecurKind::FMax || Kind == RecurKind::FCmpOGTSelect, | ||
I); | ||
if (match(I, m_FMinNum(m_Value(), m_Value()))) | ||
return InstDesc(Kind == RecurKind::FMin, I); | ||
if (match(I, m_FMaxNum(m_Value(), m_Value()))) | ||
|
@@ -941,10 +942,15 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( | |
m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) || | ||
match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value())); | ||
}; | ||
if (isIntMinMaxRecurrenceKind(Kind) || | ||
(HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind))) | ||
if (isIntMinMaxRecurrenceKind(Kind)) | ||
return isMinMaxPattern(I, Kind, Prev); | ||
else if (isFMulAddIntrinsic(I)) | ||
if (isFPMinMaxRecurrenceKind(Kind)) { | ||
if (HasRequiredFMF()) | ||
return isMinMaxPattern(I, Kind, Prev); | ||
if ((Kind == RecurKind::FMax || Kind == RecurKind::FCmpOGTSelect) && | ||
isMinMaxPattern(I, Kind, Prev).isRecurrence()) | ||
return InstDesc(I, RecurKind::FCmpOGTSelect); | ||
} else if (isFMulAddIntrinsic(I)) | ||
return InstDesc(Kind == RecurKind::FMulAdd, I, | ||
I->hasAllowReassoc() ? nullptr : I); | ||
return InstDesc(false, I); | ||
|
@@ -1207,6 +1213,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { | |
case RecurKind::UMin: | ||
return Instruction::ICmp; | ||
case RecurKind::FMax: | ||
case RecurKind::FCmpOGTSelect: | ||
case RecurKind::FMin: | ||
case RecurKind::FMaximum: | ||
case RecurKind::FMinimum: | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -25,6 +25,7 @@ | |||||||||||||||||||||||||||||||||||||||||||
#define DEBUG_TYPE "vplan" | ||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||
using namespace llvm; | ||||||||||||||||||||||||||||||||||||||||||||
using namespace VPlanPatternMatch; | ||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||
namespace { | ||||||||||||||||||||||||||||||||||||||||||||
// Class that is used to build the plain CFG for the incoming IR. | ||||||||||||||||||||||||||||||||||||||||||||
|
@@ -427,7 +428,6 @@ static void createLoopRegion(VPlan &Plan, VPBlockBase *HeaderVPB) { | |||||||||||||||||||||||||||||||||||||||||||
static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, | ||||||||||||||||||||||||||||||||||||||||||||
VPBasicBlock *LatchVPBB, Type *IdxTy, | ||||||||||||||||||||||||||||||||||||||||||||
DebugLoc DL) { | ||||||||||||||||||||||||||||||||||||||||||||
using namespace VPlanPatternMatch; | ||||||||||||||||||||||||||||||||||||||||||||
Value *StartIdx = ConstantInt::get(IdxTy, 0); | ||||||||||||||||||||||||||||||||||||||||||||
auto *StartV = Plan.getOrAddLiveIn(StartIdx); | ||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||
|
@@ -628,3 +628,118 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, | |||||||||||||||||||||||||||||||||||||||||||
Term->addMetadata(LLVMContext::MD_prof, BranchWeights); | ||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||
bool VPlanTransforms::handleFMaxReductionsWithoutFastMath(VPlan &Plan) { | ||||||||||||||||||||||||||||||||||||||||||||
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); | ||||||||||||||||||||||||||||||||||||||||||||
VPReductionPHIRecipe *RedPhiR = nullptr; | ||||||||||||||||||||||||||||||||||||||||||||
VPRecipeWithIRFlags *MaxOp = nullptr; | ||||||||||||||||||||||||||||||||||||||||||||
VPWidenIntOrFpInductionRecipe *WideIV = nullptr; | ||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||
// Check if there are any FCmpOGTSelect reductions using wide selects that we | ||||||||||||||||||||||||||||||||||||||||||||
// can fix up. To do so, we also need a wide canonical IV to keep track of | ||||||||||||||||||||||||||||||||||||||||||||
|
// can fix up. To do so, we also need a wide canonical IV to keep track of | |
// can fix up. To do so, we also need a wide canonical IV to keep track of |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that a scalar canonical IV always exists, and is unique. But widen ones may exist (last one found is used?) or not.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, at this stage, all inductions will still be widened, but may not be canonical.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Create a reduction that tracks the first indices where the latest maximum | |
// Create a reduction that tracks the first indices where the running maximum |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// the partial reductions in a way that correctly handles signed zeros and | |
// NaNs in the input. | |
// the partial reductions in a way that correctly handles a signed zero maximum. |
(NaN's nor non-zero numbers do not require the tracked indices.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tried to elaborate, is this clear/correct/helpful?
// Note that we do not need to check if the induction may hit the sentinel | |
// value. If the sentinel value gets hit, the final reduction value is at the | |
// last index or the maximum was never set and all lanes contain the start | |
// value. In either case, the correct value is selected. | |
// Note that we do not need to check if a final index holds the initial | |
// sentinel value (max uint). This is because of the following argument. | |
// The index is used to select one among several lanes holding the same | |
// maximum value, in order to select the correct sign in case that value is 0. | |
// So only indices of lanes holding the total maximum value are of interest. | |
// If a final index holds the sentinel value then either the maximum of the | |
// corresponding lane appeared first at that index, or this maximum was never | |
// updated and still retains the start value. The former case is clearly fine. | |
// In the latter case the start value is either NaN or a number greater or | |
// equal to all elements of that lane. If the start value is NaN all lanes | |
// eventually hold NaN as their maximum value, a tie to be broken arbitrarily. | |
// If the start value is equal to the total maximum value, then the maxima of | |
// all lanes are equal to the start value, including its sign, again a tie to | |
// be broken arbitrarily. | |
// If the start value differs from (is less than) the total maximum value, | |
// then an index holding the sentinel value corresponds to a non maximum lane | |
// and is thus irrelevant for tie breaking. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Worth commenting constant parameters.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
{MaxOp->getOperand(0), WideIV, IdxPhi}); | |
{Cmp, WideIV, IdxPhi}); |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Find the first index of with the maximum value. This is used to extract the | |
// Find the first index holding the maximum value. This is used to extract the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VPBuilder Builder(MaxResult->getParent(), | |
assert(MaxResult && "Missing a ComputeReductionResult user"); | |
VPBuilder Builder(MaxResult->getParent(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The vertical computation of each partial reduction result takes care of NaNs and signed zeroes, it is only the horizontal reduction of these vector lanes that require tie-breaking, to handle potential signed zeroes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the tie-breaking is only needed to handle signed zeroes when computing the final reduction results.
Consider a final partial reduction vector with -0.0, +0.0
and -0.0 was encountered before +0.0 (e.g. the max at iteration 2 is -0.0 and at iteration 3 it is +0.0. Doing a plain horizontal fmax reduction will produce +0.0 (-0.0 < +0.0).
We then compare the partial reduction values to the result of the horizontal reduction (-0.0 == +0.0 will also be true, selecting all lanes with zeros of any signed-ness)
Out of those, we select the one encountered first using FindFirstIV. Note that this only works for strict predicates.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The naming issue was admittedly raised before. This is still documented and titled as dealing with "FP max", and also affects loops doing
fcmp ugt
andselect
as in a testcase at the bottom. If RecurKind tries to capture the explicit pattern in the input IR, it may need to accommodate a variety of compare predicates. Signed and unsigned max and min try, OTOH, to abstract cmp/select pairs having a variety of lt/le/gt/ge predicated and same or reversed operands. Is this aiming to handle Ordered and/or Unordered FMax reduction of sets that may include NaNs (in terms of select(cmp()))?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, this for now should be restricted to selects with ordered compares for now, so the result is only NaN if the start value is NaN.
It is not restricted to OGT, OLT also works. It needs to be a strict predicate, to use FindFirstIV. For non-strict ones we would have to use FindLastIV.
Updated to OrderedFCmpSelect, wdyt?