@@ -7280,6 +7280,93 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
72807280 return DAG.getBitcast(VT, UnrolledLaneOp);
72817281}
72827282
7283+ // Right now, only subgroup.shuffle implemented, but other
7284+ // future subgroup ops can use this function too
7285+ static SDValue lowerSubgroupOp(const SITargetLowering &TLI, SDNode *N,
7286+ SelectionDAG &DAG) {
7287+ EVT VT = N->getValueType(0);
7288+ unsigned ValSize = VT.getSizeInBits();
7289+ unsigned IID = N->getConstantOperandVal(0);
7290+ SDLoc SL(N);
7291+
7292+ SDValue Value = N->getOperand(1);
7293+ SDValue Index = N->getOperand(2);
7294+
7295+ // ds_bpermute requires index to be multiplied by 4
7296+ SDValue ShiftAmount = DAG.getTargetConstant(2, SL, MVT::i32);
7297+ SDValue ShiftedIndex = DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index,
7298+ ShiftAmount);
7299+
7300+ // Intrinsics will require i32 to operate on
7301+ SDValue Value32 = Value;
7302+ if ((ValSize != 32) || (VT.isFloatingPoint()))
7303+ Value32 = DAG.getBitcast(MVT::i32, Value);
7304+
7305+ auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
7306+ SmallVector<SDValue> IntrinArgs) -> SDValue {
7307+ SmallVector<SDValue> Operands(1);
7308+ Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
7309+ Operands.append(IntrinArgs);
7310+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
7311+ };
7312+
7313+ switch (IID) {
7314+ case Intrinsic::amdgcn_subgroup_shuffle:
7315+ if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
7316+ // If we can bpermute across the whole wave, then just do that
7317+ SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7318+ MVT::i32, {ShiftedIndex, Value32});
7319+ return DAG.getBitcast(VT, BPermute);
7320+ } else {
7321+ assert(TLI.getSubtarget()->isWave64());
7322+
7323+ // Otherwise, we need to make use of whole wave mode
7324+ SDValue PoisonVal = DAG.getPOISON(Value32->getValueType(0));
7325+ SDValue PoisonIndex = DAG.getPOISON(ShiftedIndex->getValueType(0));
7326+
7327+ // Set inactive lanes to poison
7328+ SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
7329+ MVT::i32, {Value32, PoisonVal});
7330+ SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive,
7331+ MVT::i32, {ShiftedIndex, PoisonIndex});
7332+
7333+ SDValue Swapped = MakeIntrinsic(Intrinsic::amdgcn_permlane64,
7334+ MVT::i32, {WWMValue});
7335+
7336+ // Get permutation of each half, then we'll select which one to use
7337+ SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7338+ MVT::i32, {WWMIndex, WWMValue});
7339+ SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7340+ MVT::i32, {WWMIndex, Swapped});
7341+ SDValue BPermOtherHalfWWM = MakeIntrinsic(Intrinsic::amdgcn_wwm,
7342+ MVT::i32, {BPermOtherHalf});
7343+
7344+ // Select which side to take the permute from
7345+ SDValue ThreadIDMask = DAG.getTargetConstant(UINT32_MAX, SL, MVT::i32);
7346+ SDValue ThreadIDLo = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7347+ {ThreadIDMask,
7348+ DAG.getTargetConstant(0, SL,
7349+ MVT::i32)});
7350+ SDValue ThreadID = MakeIntrinsic(Intrinsic::amdgcn_mbcnt_hi, MVT::i32,
7351+ {ThreadIDMask, ThreadIDLo});
7352+
7353+ SDValue SameOrOtherHalf = DAG.getNode(ISD::AND, SL, MVT::i32,
7354+ DAG.getNode(ISD::XOR, SL, MVT::i32,
7355+ ThreadID, Index),
7356+ DAG.getTargetConstant(32, SL,
7357+ MVT::i32));
7358+ SDValue UseSameHalf = DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7359+ DAG.getConstant(0, SL, MVT::i32),
7360+ ISD::SETEQ);
7361+ SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf,
7362+ BPermSameHalf, BPermOtherHalfWWM);
7363+ return DAG.getBitcast(VT, Result);
7364+ }
7365+ default:
7366+ return SDValue();
7367+ }
7368+ }
7369+
72837370void SITargetLowering::ReplaceNodeResults(SDNode *N,
72847371 SmallVectorImpl<SDValue> &Results,
72857372 SelectionDAG &DAG) const {
@@ -10187,6 +10274,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1018710274 Poisons.push_back(DAG.getPOISON(ValTy));
1018810275 return DAG.getMergeValues(Poisons, SDLoc(Op));
1018910276 }
10277+ case Intrinsic::amdgcn_subgroup_shuffle:
10278+ return lowerSubgroupOp(*this, Op.getNode(), DAG);
1019010279 default:
1019110280 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1019210281 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
0 commit comments