@@ -9178,6 +9178,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
91789178 case GenISAIntrinsic::GenISA_WaveInverseBallot:
91799179 emitWaveInverseBallot(inst);
91809180 break;
9181+ case GenISAIntrinsic::GenISA_WaveClusteredBallot:
9182+ emitWaveClusteredBallot(inst);
9183+ break;
91819184 case GenISAIntrinsic::GenISA_WaveShuffleIndex:
91829185 case GenISAIntrinsic::GenISA_WaveBroadcast:
91839186 emitSimdShuffle(inst);
@@ -21726,6 +21729,23 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst)
2172621729 destination = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE);
2172721730 }
2172821731
21732+ emitBallotUniform(inst, &destination, disableHelperLanes);
21733+
21734+ if (destination != m_destination)
21735+ {
21736+ m_encoder->Cast(m_destination, destination);
21737+ m_encoder->Push();
21738+ }
21739+ if (disableHelperLanes)
21740+ {
21741+ ResetVMask();
21742+ }
21743+ }
21744+
21745+ void EmitPass::emitBallotUniform(llvm::GenIntrinsicInst* inst, CVariable** destination, bool disableHelperLanes)
21746+ {
21747+ IGC_ASSERT_MESSAGE((*destination)->IsUniform(), "Unsupported: dst must be uniform");
21748+
2172921749 bool uniform_active_lane = false;
2173021750 if (ConstantInt * pConst = dyn_cast<ConstantInt>(inst->getOperand(0)))
2173121751 {
@@ -21741,17 +21761,17 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst)
2174121761 if (m_currShader->m_dispatchSize == SIMDMode::SIMD8 && m_currShader->HasFullDispatchMask())
2174221762 {
2174321763 // for SIMD8 make sure the higher 8 bits of the flag are not copied
21744- destination = m_currShader->GetNewVariable(1, ISA_TYPE_UB, EALIGN_BYTE, true, CName::NONE);
21764+ * destination = m_currShader->GetNewVariable(1, ISA_TYPE_UB, EALIGN_BYTE, true, CName::NONE);
2174521765 }
21746- m_encoder->BoolToInt(destination, f0);
21766+ m_encoder->BoolToInt(* destination, f0);
2174721767 if (!m_currShader->HasFullDispatchMask())
2174821768 {
2174921769 CVariable* dispatchMask = m_currShader->GetNewAlias(
2175021770 m_currShader->GetSR0(),
2175121771 ISA_TYPE_UD,
2175221772 (m_pattern->NeedVMask() && !disableHelperLanes ? 3 : 2) * SIZE_DWORD,
2175321773 1);
21754- m_encoder->And(destination, dispatchMask, destination);
21774+ m_encoder->And(* destination, dispatchMask, * destination);
2175521775 }
2175621776 }
2175721777 else
@@ -21770,21 +21790,99 @@ void EmitPass::emitWaveBallot(llvm::GenIntrinsicInst* inst)
2177021790
2177121791 m_encoder->SetSimdSize(SIMDMode::SIMD1);
2177221792 m_encoder->SetNoMask();
21773- m_encoder->And(destination, exeMask, vf0);
21793+ m_encoder->And(* destination, exeMask, vf0);
2177421794 m_encoder->Push();
2177521795 }
2177621796 else
2177721797 {
21778- m_encoder->Cast(destination, exeMask);
21798+ m_encoder->Cast(* destination, exeMask);
2177921799 m_encoder->Push();
2178021800 }
2178121801 }
21802+ }
2178221803
21783- if (destination != m_destination)
21804+ void EmitPass::emitWaveClusteredBallot(llvm::GenIntrinsicInst* inst)
21805+ {
21806+ IGC_ASSERT_MESSAGE(!m_destination->IsUniform(), "Unsupported: dst must be non-uniform");
21807+
21808+ IGC_ASSERT_MESSAGE(isa<llvm::ConstantInt>(inst->getOperand(1)), "Unsupported: cluster size must be constant");
21809+ const unsigned int clusterSize = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
21810+
21811+ IGC_ASSERT_MESSAGE(clusterSize <= numLanes(m_currShader->m_dispatchSize), "cluster size must be smaller or equal to SIMD");
21812+ IGC_ASSERT_MESSAGE(clusterSize % 8 == 0, "cluster size must be 8/16/32");
21813+
21814+ bool disableHelperLanes = int_cast<int>(cast<ConstantInt>(inst->getArgOperand(2))->getSExtValue()) == 2;
21815+ if (disableHelperLanes)
2178421816 {
21785- m_encoder->Cast(m_destination, destination);
21817+ ForceDMask();
21818+ }
21819+
21820+ // Run ballot.
21821+ CVariable* ballotResult = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_GRF, true, "ballotResult");
21822+ emitBallotUniform(inst, &ballotResult, disableHelperLanes);
21823+
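21824+ // If the cluster spans the full SIMD width, simply propagate the ballot result to every lane. NOTE(review): the early return below skips the trailing ResetVMask() that pairs with ForceDMask() when disableHelperLanes is set - confirm this is intended.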
21825+ if (clusterSize == numLanes(m_currShader->m_dispatchSize))
21826+ {
21827+ m_encoder->Copy(m_destination, ballotResult);
21828+ if (m_currShader->m_numberInstance > 1)
21829+ {
21830+ m_encoder->SetSecondHalf(true);
21831+ m_encoder->Copy(m_destination, ballotResult);
21832+ m_encoder->SetSecondHalf(false);
21833+ }
21834+ m_encoder->Push();
21835+ return;
21836+ }
21837+
21838+ // ballotResult contains the result from all lanes. At this point a cluster is either 8 or 16 lanes wide,
21839+ // so the clusters within ballotResult are byte-aligned. Extract each cluster from the result.
21840+
21841+ CVariable* zero = m_currShader->ImmToVariable(0, ISA_TYPE_UD);
21842+ m_encoder->Copy(m_destination, zero);
21843+ if (m_currShader->m_numberInstance > 1)
21844+ {
21845+ m_encoder->SetSecondHalf(true);
21846+ m_encoder->Copy(m_destination, zero);
21847+ m_encoder->SetSecondHalf(false);
21848+ }
21849+ m_encoder->Push();
21850+
21851+ if (clusterSize == 8)
21852+ {
21853+ CVariable* ballotAlias = m_currShader->GetNewAlias(ballotResult, ISA_TYPE_B, 0, 4, false);
21854+ CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, ISA_TYPE_B, 0, numLanes(m_currShader->m_SIMDSize) * 4);
21855+
21856+ m_encoder->SetSrcRegion(0, 1, 8, 0);
21857+ m_encoder->SetDstRegion(4);
21858+ m_encoder->Copy(dstAlias, ballotAlias);
21859+ if (m_currShader->m_numberInstance > 1)
21860+ {
21861+ m_encoder->SetSecondHalf(true);
21862+ m_encoder->SetSrcSubReg(0, 2);
21863+ m_encoder->Copy(dstAlias, ballotAlias);
21864+ m_encoder->SetSecondHalf(false);
21865+ }
21866+ m_encoder->Push();
21867+ }
21868+ else if (clusterSize == 16)
21869+ {
21870+ CVariable* ballotAlias = m_currShader->GetNewAlias(ballotResult, ISA_TYPE_UW, 0, 2, false);
21871+ CVariable* dstAlias = m_currShader->GetNewAlias(m_destination, ISA_TYPE_UW, 0, numLanes(m_currShader->m_SIMDSize) * 2);
21872+
21873+ m_encoder->SetSrcRegion(0, 1, 16, 0);
21874+ m_encoder->SetDstRegion(2);
21875+ m_encoder->Copy(dstAlias, ballotAlias);
21876+ if (m_currShader->m_numberInstance > 1)
21877+ {
21878+ m_encoder->SetSecondHalf(true);
21879+ m_encoder->SetSrcSubReg(0, 1);
21880+ m_encoder->Copy(dstAlias, ballotAlias);
21881+ m_encoder->SetSecondHalf(false);
21882+ }
2178621883 m_encoder->Push();
2178721884 }
21885+
2178821886 if (disableHelperLanes)
2178921887 {
2179021888 ResetVMask();
0 commit comments