@@ -8980,6 +8980,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
8980
8980
case GenISAIntrinsic::GenISA_WaveClustered:
8981
8981
emitWaveClustered(inst);
8982
8982
break;
8983
+ case GenISAIntrinsic::GenISA_WaveClusteredInterleave:
8984
+ emitWaveClusteredInterleave(inst);
8985
+ break;
8983
8986
case GenISAIntrinsic::GenISA_dp4a_ss:
8984
8987
case GenISAIntrinsic::GenISA_dp4a_uu:
8985
8988
case GenISAIntrinsic::GenISA_dp4a_su:
@@ -13802,6 +13805,8 @@ void EmitPass::emitReductionClustered(const e_opcode op, const uint64_t identity
13802
13805
}
13803
13806
}
13804
13807
13808
+ // Emits interleave reduction, first preparing the input data. This guarantees to produce
13809
+ // correct result even if not all lanes are active.
13805
13810
void EmitPass::emitReductionInterleave(const e_opcode op, const uint64_t identityValue, const VISA_Type type,
13806
13811
const bool negate, const unsigned int step, CVariable* const src, CVariable* const dst)
13807
13812
{
@@ -13819,17 +13824,31 @@ void EmitPass::emitReductionInterleave(const e_opcode op, const uint64_t identit
13819
13824
13820
13825
CVariable* srcH1 = ScanReducePrepareSrc(type, identityValue, negate, false /* secondHalf */,
13821
13826
src, nullptr /* dst */);
13822
- CVariable* temp = srcH1;
13827
+
13828
+ CVariable* srcH2 = nullptr;
13829
+ if (firstStep == 16 && m_currShader->m_numberInstance > 1)
13830
+ {
13831
+ srcH2 = ScanReducePrepareSrc(type, identityValue, negate, true /* secondHalf */,
13832
+ src, nullptr /* dst */);
13833
+ }
13834
+
13835
+ emitReductionInterleave(op, type, m_currShader->m_SIMDSize, step, false, srcH1, srcH2, dst);
13836
+ }
13837
+
13838
+ // Directly emits interleave reduction on input data, without preparing the input.
13839
+ void EmitPass::emitReductionInterleave(const e_opcode op, const VISA_Type type, const SIMDMode simd,
13840
+ const unsigned int step, const bool noMaskBroadcast, CVariable* const src1, CVariable* const src2, CVariable* const dst)
13841
+ {
13842
+ const uint16_t firstStep = m_currShader->m_numberInstance * numLanes(simd) / 2;
13843
+
13844
+ CVariable* temp = src1;
13823
13845
13824
13846
// Implementation is similar to emitReductionAll(), but we stop reduction before reaching SIMD1.
13825
13847
for (unsigned int currentStep = firstStep; currentStep >= step; currentStep >>= 1)
13826
13848
{
13827
13849
if (currentStep == 16 && m_currShader->m_numberInstance > 1)
13828
13850
{
13829
- CVariable* srcH2 = ScanReducePrepareSrc(type, identityValue, negate, true /* secondHalf */,
13830
- src, nullptr /* dst */);
13831
-
13832
- temp = ReductionReduceHelper(op, type, SIMDMode::SIMD16, temp, srcH2);
13851
+ temp = ReductionReduceHelper(op, type, SIMDMode::SIMD16, temp, src2);
13833
13852
}
13834
13853
else
13835
13854
{
@@ -13838,15 +13857,18 @@ void EmitPass::emitReductionInterleave(const e_opcode op, const uint64_t identit
13838
13857
}
13839
13858
13840
13859
// Broadcast result
13860
+ if (noMaskBroadcast)
13861
+ m_encoder->SetNoMask();
13862
+
13841
13863
// For XeHP, for low interleave step, broadcast of 64-bit result
13842
13864
// can be optimized as a separate mov of low/high 32-bit.
13843
13865
bool use32bitMove = ScanReduceIs64BitType(type) && m_currShader->m_Platform->doScalar64bScan() && m_currShader->m_numberInstance == 1;
13844
13866
if (use32bitMove && (step == 2 || step == 4))
13845
13867
{
13846
13868
CVariable* result32b = m_currShader->GetNewAlias(temp, ISA_TYPE_UD, 0, 2 * step);
13847
- CVariable* dst32b = m_currShader->GetNewAlias(dst, ISA_TYPE_UD, 0, 2 * numLanes(m_currShader->m_SIMDSize ));
13869
+ CVariable* dst32b = m_currShader->GetNewAlias(dst, ISA_TYPE_UD, 0, 2 * numLanes(simd ));
13848
13870
13849
- m_encoder->SetSimdSize(m_currShader->m_SIMDSize );
13871
+ m_encoder->SetSimdSize(simd );
13850
13872
m_encoder->SetSrcRegion(0, 0, step, 2);
13851
13873
m_encoder->SetDstRegion(2);
13852
13874
m_encoder->Copy(dst32b, result32b);
@@ -13859,7 +13881,7 @@ void EmitPass::emitReductionInterleave(const e_opcode op, const uint64_t identit
13859
13881
return;
13860
13882
}
13861
13883
13862
- m_encoder->SetSimdSize(m_currShader->m_SIMDSize );
13884
+ m_encoder->SetSimdSize(simd );
13863
13885
m_encoder->SetSrcRegion(0, 0, step, 1);
13864
13886
m_encoder->Copy(dst, temp);
13865
13887
if (m_currShader->m_numberInstance > 1)
@@ -13871,6 +13893,119 @@ void EmitPass::emitReductionInterleave(const e_opcode op, const uint64_t identit
13871
13893
m_encoder->Push();
13872
13894
}
13873
13895
13896
// Emits a clustered interleave reduction (WaveClusteredInterleave): within each
// cluster of `clusterSize` lanes, lanes whose index is congruent modulo
// `interleaveStep` are reduced together with `op`, and each group's result is
// broadcast back over the cluster. Only a few (simd, element size, clusterSize,
// interleaveStep) combinations are supported; anything else asserts.
// NOTE(review): `identityValue` and `negate` are unused in this function (no
// ScanReducePrepareSrc call), and the reorder copies below use NoMask and read
// raw lane values — correctness when not all lanes are active presumably relies
// on the caller (cf. helper-lane disabling at the intrinsic level) — confirm.
void EmitPass::emitReductionClusteredInterleave(const e_opcode op, const uint64_t identityValue, const VISA_Type type,
    const bool negate, const unsigned int clusterSize, const unsigned int interleaveStep, CVariable* const src, CVariable* const dst)
{
    IGC_ASSERT_MESSAGE(!dst->IsUniform(), "Unsupported: dst must be non-uniform");

    auto simd = m_currShader->m_SIMDSize;
    auto dataSizeInBytes = CEncoder::GetCISADataTypeSize(type);

    // If src spans 4 GRFs and cluster spans 2 GRFs (2 clusters total), then WaveClusterInterleave can be expressed
    // as 2 x WaveInterleave, one for each pair of GRFs.
    if (m_currShader->m_numberInstance == 1 && 2 * clusterSize == numLanes(simd) &&
        numLanes(simd) * dataSizeInBytes == 4 * m_currShader->getGRFSize())
    {
        auto interleaveLanes = numLanes(simd) / 2;
        SIMDMode interleaveSIMD = lanesToSIMDMode(interleaveLanes);

        // Each half of src/dst (= one cluster) becomes an independent
        // interleave reduction at half the SIMD width.
        for (int i = 0; i < 2; ++i)
        {
            CVariable* srcAlias = m_currShader->GetNewAlias(src, type, i * interleaveLanes * dataSizeInBytes, interleaveLanes);
            CVariable* dstAlias = m_currShader->GetNewAlias(dst, type, i * interleaveLanes * dataSizeInBytes, interleaveLanes);

            // noMaskBroadcast = true: write the whole alias, not only active lanes.
            emitReductionInterleave(op, type, interleaveSIMD, interleaveStep, true, srcAlias, nullptr, dstAlias);
        }

        return;
    }

    // Implementation for each case is custom, with no general solution.

    if (m_currShader->m_numberInstance == 1 && simd == SIMDMode::SIMD32 && dataSizeInBytes == 4 && clusterSize == 16 && interleaveStep == 2)
    {
        CVariable* temp = m_currShader->GetNewVariable(numLanes(simd), type, EALIGN_GRF, false, "reduceSrc");

        // Reorder input. Spread every value by two lanes.
        //
        // | 0 | 16 | 1 | 17 | 2 | 18 | ... | 15 | 31 |
        for (int i = 0; i < 2; ++i)
        {
            m_encoder->SetNoMask();
            m_encoder->SetSimdSize(SIMDMode::SIMD16);
            m_encoder->SetSrcRegion(0, 1, 1, 0);
            m_encoder->SetSrcSubReg(0, 16 * i);
            m_encoder->SetDstRegion(2);
            m_encoder->SetDstSubReg(i);
            m_encoder->Copy(temp, src);
            m_encoder->Push();
        }

        // Reduce.
        // After the shuffle above, each halving step combines lanes belonging
        // to the same interleave group, down to 4 partial results.
        temp = ReductionReduceHelper(op, type, SIMDMode::SIMD16, temp);
        temp = ReductionReduceHelper(op, type, SIMDMode::SIMD8, temp);
        temp = ReductionReduceHelper(op, type, SIMDMode::SIMD4, temp);

        // Propagate output. Repeat each value 8 times.
        // temp: | a | b | c | d |
        // dst:  | a | c | a | c | a | c | a | c | ... | b | d | b | d | b | d | b | d |
        for (int i = 0; i < 2; ++i)
        {
            m_encoder->SetNoMask();
            m_encoder->SetSimdSize(SIMDMode::SIMD16);
            m_encoder->SetSrcRegion(0, 1, 8, 0);
            m_encoder->SetSrcSubReg(0, 2 * i);
            m_encoder->SetDstRegion(2);
            m_encoder->SetDstSubReg(i);
            m_encoder->Copy(dst, temp);
            m_encoder->Push();
        }
    }
    else if (m_currShader->m_numberInstance == 1 && simd == SIMDMode::SIMD32 && dataSizeInBytes == 4 && clusterSize == 8 && interleaveStep == 2)
    {
        CVariable* temp = m_currShader->GetNewVariable(numLanes(simd), type, EALIGN_GRF, false, "reduceSrc");

        // Reorder input. Spread every next two values by 8 lanes:
        //
        // | 0 | 1 | 8 | 9 | 16 | 17 | ... | 14 | 15 | 22 | 23 | 30 | 31 |
        for (int i = 0; i < 4; ++i)
        {
            m_encoder->SetNoMask();
            m_encoder->SetSimdSize(SIMDMode::SIMD8);
            m_encoder->SetSrcRegion(0, 8, 2, 1);
            m_encoder->SetSrcSubReg(0, 2 * i);
            m_encoder->SetDstRegion(1);
            m_encoder->SetDstSubReg(8 * i);
            m_encoder->Copy(temp, src);
            m_encoder->Push();
        }

        // Reduce.
        // Two halvings leave 8 partial results (one per cluster/group pair).
        temp = ReductionReduceHelper(op, type, SIMDMode::SIMD16, temp);
        temp = ReductionReduceHelper(op, type, SIMDMode::SIMD8, temp);

        // Propagate output. Repeat each pair of values 4 times.
        //
        // temp: | a | b | c | d | e | f | g | h |
        // dst:  | a | b | a | b | a | b | a | b | ... | g | h | g | h | g | h | g | h |
        for (int i = 0; i < 2; ++i)
        {
            m_encoder->SetNoMask();
            m_encoder->SetSimdSize(SIMDMode::SIMD16);
            m_encoder->SetSrcRegion(0, 2, 4, 0);
            m_encoder->SetSrcSubReg(0, i);
            m_encoder->SetDstRegion(2);
            m_encoder->SetDstSubReg(i);
            m_encoder->Copy(dst, temp);
            m_encoder->Push();
        }
    }
    else
    {
        IGC_ASSERT_MESSAGE(false, "Invalid WaveClusteredInterleave.");
    }
}
14008
+
13874
14009
// do prefix op across all activate channels
13875
14010
void EmitPass::emitPreOrPostFixOp(
13876
14011
e_opcode op, uint64_t identityValue, VISA_Type type, bool negateSrc,
@@ -21384,6 +21519,30 @@ void EmitPass::emitWaveInterleave(llvm::GenIntrinsicInst* inst)
21384
21519
}
21385
21520
}
21386
21521
21522
+ void EmitPass::emitWaveClusteredInterleave(llvm::GenIntrinsicInst* inst)
21523
+ {
21524
+ bool disableHelperLanes = int_cast<int>(cast<ConstantInt>(inst->getArgOperand(3))->getSExtValue()) == 2;
21525
+ if (disableHelperLanes)
21526
+ {
21527
+ ForceDMask();
21528
+ }
21529
+ CVariable* src = GetSymbol(inst->getOperand(0));
21530
+ const WaveOps op = static_cast<WaveOps>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
21531
+ const unsigned int clusterSize = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(2))->getZExtValue());
21532
+ const unsigned int interleaveStep = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(3))->getZExtValue());
21533
+ VISA_Type type;
21534
+ e_opcode opCode;
21535
+ uint64_t identity = 0;
21536
+ GetReductionOp(op, inst->getOperand(0)->getType(), identity, opCode, type);
21537
+ CVariable* dst = m_destination;
21538
+ m_encoder->SetSubSpanDestination(false);
21539
+ emitReductionClusteredInterleave(opCode, identity, type, false, clusterSize, interleaveStep, src, dst);
21540
+ if (disableHelperLanes)
21541
+ {
21542
+ ResetVMask();
21543
+ }
21544
+ }
21545
+
21387
21546
void EmitPass::emitDP4A(GenIntrinsicInst* GII, const SSource* Sources, const DstModifier& modifier, bool isAccSigned) {
21388
21547
GenISAIntrinsic::ID GIID = GII->getIntrinsicID();
21389
21548
CVariable* dst = m_destination;
0 commit comments