-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU] Add new llvm.amdgcn.wave.shuffle intrinsic #167372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
83f5dd6
741566b
f3f133f
c2d8e0e
016481e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2718,6 +2718,17 @@ def int_amdgcn_call_whole_wave: | |
| llvm_vararg_ty], // The arguments to the callee. | ||
| [IntrConvergent]>; | ||
|
|
||
| // <result> | ||
| // llvm.amdgcn.wave.shuffle <value> <id> | ||
| // Each lane returns the <value> held by the lane selected by <id>. | ||
| // value and result must be the same 32-bit type. The lowering only | ||
| // accepts 32-bit values (e.g. i32 or float); i1 is not handled. Any index | ||
| // value that's outside the valid range will wrap around, | ||
| // and reading from an inactive lane will return 0. | ||
| // NOTE(review): the wave64 fallback lowering fills inactive lanes with | ||
| // poison before permuting - confirm the "returns 0" guarantee holds there. | ||
|
||
| def int_amdgcn_wave_shuffle : | ||
| DefaultAttrsIntrinsic<[llvm_any_ty], // return types | ||
| [LLVMMatchType<0>, llvm_i32_ty], // arg types | ||
| [IntrConvergent, IntrNoMem, IntrNoFree, IntrWillReturn, IntrNoCallback]>; // flags | ||
saxlungs marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| //===----------------------------------------------------------------------===// | ||
| // CI+ Intrinsics | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7280,6 +7280,85 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, | |
| return DAG.getBitcast(VT, UnrolledLaneOp); | ||
| } | ||
|
|
||
| static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, | ||
| SelectionDAG &DAG) { | ||
| EVT VT = N->getValueType(0); | ||
| unsigned ValSize = VT.getSizeInBits(); | ||
| assert(ValSize == 32); | ||
saxlungs marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| SDLoc SL(N); | ||
|
|
||
| SDValue Value = N->getOperand(1); | ||
| SDValue Index = N->getOperand(2); | ||
|
|
||
| // ds_bpermute requires index to be multiplied by 4 | ||
| SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL); | ||
| SDValue ShiftedIndex = DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, | ||
| ShiftAmount); | ||
|
|
||
| // Intrinsics will require i32 to operate on | ||
| SDValue ValueI32 = Value; | ||
| if (VT.isFloatingPoint()) | ||
|
||
| ValueI32 = DAG.getBitcast(MVT::i32, Value); | ||
|
|
||
| auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT, | ||
| SmallVector<SDValue> IntrinArgs) -> SDValue { | ||
| SmallVector<SDValue> Operands(1); | ||
| Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32); | ||
| Operands.append(IntrinArgs); | ||
| return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands); | ||
| }; | ||
|
|
||
| // If we can bpermute across the whole wave, then just do that | ||
| if (TLI.getSubtarget()->supportsWaveWideBPermute()) { | ||
| SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32, | ||
| {ShiftedIndex, ValueI32}); | ||
| return DAG.getBitcast(VT, BPermute); | ||
| } | ||
|
|
||
| assert(TLI.getSubtarget()->isWave64()); | ||
|
|
||
| // Otherwise, we need to make use of whole wave mode | ||
| SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0)); | ||
| SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0)); | ||
|
||
|
|
||
| // Set inactive lanes to poison | ||
| SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32, | ||
| {ValueI32, PoisonVal}); | ||
| SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32, | ||
| {ShiftedIndex, PoisonIndex}); | ||
|
|
||
| SDValue Swapped = | ||
| MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue}); | ||
|
|
||
| // Get permutation of each half, then we'll select which one to use | ||
| SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, | ||
| MVT::i32, {WWMIndex, WWMValue}); | ||
| SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, | ||
| MVT::i32, {WWMIndex, Swapped}); | ||
| SDValue BPermOtherHalfWWM = | ||
| MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf}); | ||
|
|
||
| // Select which side to take the permute from | ||
| SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32); | ||
| // We can get away with only using mbcnt_lo here since we're only | ||
| // trying to detect which side of 32 each lane is on, and mbcnt_lo | ||
| // returns 32 for lanes 32-63. | ||
| SDValue ThreadID = | ||
| MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32, | ||
| {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)}); | ||
|
|
||
| SDValue SameOrOtherHalf = | ||
| DAG.getNode(ISD::AND, SL, MVT::i32, | ||
| DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index), | ||
| DAG.getTargetConstant(32, SL, MVT::i32)); | ||
| SDValue UseSameHalf = | ||
| DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf, | ||
| DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ); | ||
| SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf, | ||
| BPermOtherHalfWWM); | ||
| return DAG.getBitcast(VT, Result); | ||
| } | ||
|
|
||
| void SITargetLowering::ReplaceNodeResults(SDNode *N, | ||
| SmallVectorImpl<SDValue> &Results, | ||
| SelectionDAG &DAG) const { | ||
|
|
@@ -10187,6 +10266,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, | |
| Poisons.push_back(DAG.getPOISON(ValTy)); | ||
| return DAG.getMergeValues(Poisons, SDLoc(Op)); | ||
| } | ||
| case Intrinsic::amdgcn_wave_shuffle: | ||
| return lowerWaveShuffle(*this, Op.getNode(), DAG); | ||
| default: | ||
| if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = | ||
| AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Boolean" usually means i1. It looks like you only handle i32 and float.