Skip to content

Commit ed5245b

Browse files
pkwasnie-intel, igcbot
authored and committed
new intrinsic: sub group clustered ballot
Adds a new intrinsic: sub group clustered ballot. It works similarly to sub group ballot, but each lane contains results only from its own cluster. Only cluster sizes 8 and 16 are supported.
1 parent 950be43 commit ed5245b

File tree

11 files changed

+150
-7
lines changed

11 files changed

+150
-7
lines changed

IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,7 @@ uint __builtin_IB_get_image_bti(uint img);
521521

522522
// ballot intrinsic
523523
uint __builtin_IB_WaveBallot(bool p);
524+
uint __builtin_IB_clustered_WaveBallot(bool p, uint cluster_size);
524525

525526
// VA
526527
void __builtin_IB_va_erode_64x4( __local uchar* dst, float2 coords, int srcImgId, int i_accelerator );

IGC/Compiler/CISACodeGen/CheckInstrTypes.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ void CheckInstrTypes::visitCallInst(CallInst& C)
342342
case GenISAIntrinsic::GenISA_WaveBallot:
343343
case GenISAIntrinsic::GenISA_wavebarrier:
344344
case GenISAIntrinsic::GenISA_WaveInverseBallot:
345+
case GenISAIntrinsic::GenISA_WaveClusteredBallot:
345346
case GenISAIntrinsic::GenISA_WavePrefix:
346347
case GenISAIntrinsic::GenISA_WaveClustered:
347348
case GenISAIntrinsic::GenISA_WaveInterleave:

IGC/Compiler/CISACodeGen/CodeSinking.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2492,6 +2492,7 @@ namespace IGC {
24922492
case GenISAIntrinsic::GenISA_WaveClusteredBroadcast:
24932493
case GenISAIntrinsic::GenISA_WaveBallot:
24942494
case GenISAIntrinsic::GenISA_WaveInverseBallot:
2495+
case GenISAIntrinsic::GenISA_WaveClusteredBallot:
24952496
case GenISAIntrinsic::GenISA_WaveAll:
24962497
case GenISAIntrinsic::GenISA_WaveClustered:
24972498
case GenISAIntrinsic::GenISA_WaveInterleave:

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 105 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9178,6 +9178,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
91789178
case GenISAIntrinsic::GenISA_WaveInverseBallot:
91799179
emitWaveInverseBallot(inst);
91809180
break;
9181+
case GenISAIntrinsic::GenISA_WaveClusteredBallot:
9182+
emitWaveClusteredBallot(inst);
9183+
break;
91819184
case GenISAIntrinsic::GenISA_WaveShuffleIndex:
91829185
case GenISAIntrinsic::GenISA_WaveBroadcast:
91839186
emitSimdShuffle(inst);
@@ -21726,6 +21729,23 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst)
2172621729
destination = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
2172721730
}
2172821731

21732+
emitBallotUniform(inst, &destination, disableHelperLanes);
21733+
21734+
if (destination != m_destination)
21735+
{
21736+
m_encoder->Cast(m_destination, destination);
21737+
m_encoder->Push();
21738+
}
21739+
if (disableHelperLanes)
21740+
{
21741+
ResetVMask();
21742+
}
21743+
}
21744+
21745+
void EmitPass::emitBallotUniform(llvm::GenIntrinsicInst* inst, CVariable** destination, bool disableHelperLanes)
21746+
{
21747+
IGC_ASSERT_MESSAGE((*destination)->IsUniform(), "Unsupported: dst must be uniform");
21748+
2172921749
bool uniform_active_lane = false;
2173021750
if (ConstantInt * pConst = dyn_cast<ConstantInt>(inst->getOperand(0)))
2173121751
{
@@ -21741,17 +21761,17 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst)
2174121761
if (m_currShader->m_dispatchSize == SIMDMode::SIMD8 && m_currShader->HasFullDispatchMask())
2174221762
{
2174321763
// for SIMD8 make sure the higher 8 bits of the flag are not copied
21744-
destination = m_currShader->GetNewVariable(1, ISA_TYPE_UB, EALIGN_BYTE, true, CName::NONE);
21764+
*destination = m_currShader->GetNewVariable(1, ISA_TYPE_UB, EALIGN_BYTE, true, CName::NONE);
2174521765
}
21746-
m_encoder->BoolToInt(destination, f0);
21766+
m_encoder->BoolToInt(*destination, f0);
2174721767
if (!m_currShader->HasFullDispatchMask())
2174821768
{
2174921769
CVariable* dispatchMask = m_currShader->GetNewAlias(
2175021770
m_currShader->GetSR0(),
2175121771
ISA_TYPE_UD,
2175221772
(m_pattern->NeedVMask() && !disableHelperLanes ? 3 : 2) * SIZE_DWORD,
2175321773
1);
21754-
m_encoder->And(destination, dispatchMask, destination);
21774+
m_encoder->And(*destination, dispatchMask, *destination);
2175521775
}
2175621776
}
2175721777
else
@@ -21770,21 +21790,99 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst)
2177021790

2177121791
m_encoder->SetSimdSize(SIMDMode::SIMD1);
2177221792
m_encoder->SetNoMask();
21773-
m_encoder->And(destination, exeMask, vf0);
21793+
m_encoder->And(*destination, exeMask, vf0);
2177421794
m_encoder->Push();
2177521795
}
2177621796
else
2177721797
{
21778-
m_encoder->Cast(destination, exeMask);
21798+
m_encoder->Cast(*destination, exeMask);
2177921799
m_encoder->Push();
2178021800
}
2178121801
}
21802+
}
2178221803

21783-
if (destination != m_destination)
21804+
void EmitPass::emitWaveClusteredBallot(llvm::GenIntrinsicInst* inst)
21805+
{
21806+
IGC_ASSERT_MESSAGE(!m_destination->IsUniform(), "Unsupported: dst must be non-uniform");
21807+
21808+
IGC_ASSERT_MESSAGE(isa<llvm::ConstantInt>(inst->getOperand(1)), "Unsupported: cluster size must be constant");
21809+
const unsigned int clusterSize = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
21810+
21811+
IGC_ASSERT_MESSAGE(clusterSize <= numLanes(m_currShader->m_dispatchSize), "cluster size must be smaller or equal to SIMD");
21812+
IGC_ASSERT_MESSAGE(clusterSize % 8 == 0, "cluster size must be 8/16/32");
21813+
21814+
bool disableHelperLanes = int_cast<int>(cast<ConstantInt>(inst->getArgOperand(2))->getSExtValue()) == 2;
21815+
if (disableHelperLanes)
2178421816
{
21785-
m_encoder->Cast(m_destination, destination);
21817+
ForceDMask();
21818+
}
21819+
21820+
// Run ballot.
21821+
CVariable* ballotResult = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, "ballotResult");
21822+
emitBallotUniform(inst, &ballotResult, disableHelperLanes);
21823+
21824+
// In case cluster takes full SIMD size, then just propagate result.
21825+
if (clusterSize == numLanes(m_currShader->m_dispatchSize))
21826+
{
21827+
m_encoder->Copy(m_destination, ballotResult);
21828+
if (m_currShader->m_numberInstance > 1)
21829+
{
21830+
m_encoder->SetSecondHalf(true);
21831+
m_encoder->Copy(m_destination, ballotResult);
21832+
m_encoder->SetSecondHalf(false);
21833+
}
21834+
m_encoder->Push();
21835+
return;
21836+
}
21837+
21838+
// ballotResult contains result from all lanes. Cluster can be either 8 or 16 lanes, so clusters in
21839+
// ballotResult are byte-aligned. Extract clusters from the result.
21840+
21841+
CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
21842+
m_encoder->Copy(m_destination, zero);
21843+
if (m_currShader->m_numberInstance > 1)
21844+
{
21845+
m_encoder->SetSecondHalf(true);
21846+
m_encoder->Copy(m_destination, zero);
21847+
m_encoder->SetSecondHalf(false);
21848+
}
21849+
m_encoder->Push();
21850+
21851+
if (clusterSize == 8)
21852+
{
21853+
CVariable* ballotAlias = m_currShader->GetNewAlias(ballotResult, ISA_TYPE_B, 0, 4, false);
21854+
CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, ISA_TYPE_B, 0, numLanes(m_currShader->m_SIMDSize) * 4);
21855+
21856+
m_encoder->SetSrcRegion(0, 1, 8, 0);
21857+
m_encoder->SetDstRegion(4);
21858+
m_encoder->Copy(dstAlias, ballotAlias);
21859+
if (m_currShader->m_numberInstance > 1)
21860+
{
21861+
m_encoder->SetSecondHalf(true);
21862+
m_encoder->SetSrcSubReg(0, 2);
21863+
m_encoder->Copy(dstAlias, ballotAlias);
21864+
m_encoder->SetSecondHalf(false);
21865+
}
21866+
m_encoder->Push();
21867+
}
21868+
else if (clusterSize == 16)
21869+
{
21870+
CVariable* ballotAlias = m_currShader->GetNewAlias(ballotResult, ISA_TYPE_UW, 0, 2, false);
21871+
CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UW, 0, numLanes(m_currShader->m_SIMDSize) * 2);
21872+
21873+
m_encoder->SetSrcRegion(0, 1, 16, 0);
21874+
m_encoder->SetDstRegion(2);
21875+
m_encoder->Copy(dstAlias, ballotAlias);
21876+
if (m_currShader->m_numberInstance > 1)
21877+
{
21878+
m_encoder->SetSecondHalf(true);
21879+
m_encoder->SetSrcSubReg(0, 1);
21880+
m_encoder->Copy(dstAlias, ballotAlias);
21881+
m_encoder->SetSecondHalf(false);
21882+
}
2178621883
m_encoder->Push();
2178721884
}
21885+
2178821886
if (disableHelperLanes)
2178921887
{
2179021888
ResetVMask();

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,8 @@ class EmitPass : public llvm::FunctionPass
467467

468468
// CrossLane Instructions
469469
void emitWaveBallot(llvm::GenIntrinsicInst* inst);
470+
void emitWaveClusteredBallot(llvm::GenIntrinsicInst* inst);
471+
void emitBallotUniform(llvm::GenIntrinsicInst* inst, CVariable** destination, bool disableHelperLanes);
470472
void emitWaveInverseBallot(llvm::GenIntrinsicInst* inst);
471473
void emitWaveShuffleIndex(llvm::GenIntrinsicInst* inst);
472474
void emitWavePrefix(llvm::WavePrefixIntrinsic* I);

IGC/Compiler/CISACodeGen/PatternMatchPass.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1372,6 +1372,7 @@ namespace IGC
13721372
break;
13731373
case GenISAIntrinsic::GenISA_WaveBallot:
13741374
case GenISAIntrinsic::GenISA_WaveInverseBallot:
1375+
case GenISAIntrinsic::GenISA_WaveClusteredBallot:
13751376
case GenISAIntrinsic::GenISA_WaveAll:
13761377
case GenISAIntrinsic::GenISA_WaveClustered:
13771378
case GenISAIntrinsic::GenISA_WaveInterleave:
@@ -5293,6 +5294,7 @@ namespace IGC
52935294
switch (I.getIntrinsicID())
52945295
{
52955296
case GenISAIntrinsic::GenISA_WaveAll:
5297+
case GenISAIntrinsic::GenISA_WaveClusteredBallot:
52965298
helperLaneIndex = 2;
52975299
break;
52985300
case GenISAIntrinsic::GenISA_WaveBallot:

IGC/Compiler/CISACodeGen/WIAnalysis.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1413,6 +1413,7 @@ WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CallInst* inst)
14131413
intrinsic_name == llvm_waveBroadcast ||
14141414
intrinsic_name == llvm_waveClusteredBroadcast ||
14151415
intrinsic_name == llvm_waveBallot ||
1416+
intrinsic_name == llvm_waveClusteredBallot ||
14161417
intrinsic_name == llvm_waveAll ||
14171418
intrinsic_name == llvm_waveClustered ||
14181419
intrinsic_name == llvm_waveInterleave ||

IGC/Compiler/CISACodeGen/helper.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1885,6 +1885,7 @@ namespace IGC
18851885
opcode == llvm_waveBroadcast ||
18861886
opcode == llvm_waveClusteredBroadcast ||
18871887
opcode == llvm_waveBallot ||
1888+
opcode == llvm_waveClusteredBallot ||
18881889
opcode == llvm_simdShuffleDown ||
18891890
opcode == llvm_simdBlockRead||
18901891
opcode == llvm_simdBlockReadBindless);

IGC/Compiler/CISACodeGen/opCode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ DECLARE_OPCODE(GenISA_pair_to_ptr, GenISAIntrinsic, llvm_pair_to_ptr, false, fal
281281

282282
// Wave intrinsics
283283
DECLARE_OPCODE(GenISA_WaveBallot, GenISAIntrinsic, llvm_waveBallot, false, false, false, false, false, false, false)
284+
DECLARE_OPCODE(GenISA_WaveClusteredBallot, GenISAIntrinsic, llvm_waveClusteredBallot, false, false, false, false, false, false, false)
284285
DECLARE_OPCODE(GenISA_WaveAll, GenISAIntrinsic, llvm_waveAll, false, false, false, false, false, false, false)
285286
DECLARE_OPCODE(GenISA_WaveClustered, GenISAIntrinsic, llvm_waveClustered, false, false, false, false, false, false, false)
286287
DECLARE_OPCODE(GenISA_WaveInterleave, GenISAIntrinsic, llvm_waveInterleave, false, false, false, false, false, false, false)

IGC/Compiler/Optimizer/OCLBIUtils.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1206,6 +1206,12 @@ class CWaveBallotIntrinsic : public CCommand
12061206
}
12071207

12081208
m_args.push_back(truncInst);
1209+
1210+
if (isaId == GenISAIntrinsic::GenISA_WaveClusteredBallot)
1211+
{
1212+
m_args.push_back(m_pCallInst->getArgOperand(1));
1213+
}
1214+
12091215
m_args.push_back(IRB.getInt32(0));
12101216
replaceGenISACallInst(isaId);
12111217
}
@@ -1761,6 +1767,7 @@ CBuiltinsResolver::CBuiltinsResolver(CImagesBI::ParamMap* paramMap, CImagesBI::I
17611767

17621768
// Ballot builtins
17631769
m_CommandMap["__builtin_IB_WaveBallot"] = CWaveBallotIntrinsic::create(GenISAIntrinsic::GenISA_WaveBallot);
1770+
m_CommandMap["__builtin_IB_clustered_WaveBallot"] = CWaveBallotIntrinsic::create(GenISAIntrinsic::GenISA_WaveClusteredBallot);
17641771

17651772
m_CommandMap[StringRef("__builtin_IB_samplepos")] = CSamplePos::create();
17661773

0 commit comments

Comments
 (0)