@@ -5763,20 +5763,62 @@ void EmitPass::emitSimdClusteredBroadcast(llvm::Instruction* inst)
57635763 IGC_ASSERT_MESSAGE(isa<llvm::ConstantInt>(inst->getOperand(2)), "Unsupported: cluster lane must be constant");
57645764 const unsigned int clusterLane = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(2))->getZExtValue());
57655765
5766- IGC_ASSERT_MESSAGE(clusterSize < numLanes(m_currShader->m_dispatchSize), "cluster size must be smaller than SIMD");
5767- IGC_ASSERT_MESSAGE(clusterSize == 8 || clusterSize == 16 , "cluster size must be 8 or 16");
5766+ IGC_ASSERT_MESSAGE(clusterSize <= numLanes(m_currShader->m_dispatchSize), "cluster size must be smaller or equal to SIMD");
5767+ IGC_ASSERT_MESSAGE(clusterSize % 8 == 0 , "cluster size must be 8 or 16");
57685768 IGC_ASSERT_MESSAGE(clusterLane < clusterSize, "cluster lane does not fit in cluster size");
57695769
5770- m_encoder->SetSrcRegion(0, clusterSize, clusterSize, 0);
5771- m_encoder->SetSrcSubReg(0, clusterLane);
5772- m_encoder->Copy(m_destination, data);
5773- if (m_currShader->m_numberInstance > 1)
5770+ if (clusterSize == numLanes(m_currShader->m_dispatchSize))
57745771 {
5775- m_encoder->SetSecondHalf(true);
5772+ // There is actually no cluster, just do subgroup broadcast instead.
5773+ if (m_currShader->m_numberInstance > 1)
5774+ {
5775+ // Use an intermediate uniform variable
5776+ CVariable* uniformTemp = m_currShader->GetNewVariable(
5777+ 1,
5778+ data->GetType(),
5779+ m_encoder->GetCISADataTypeAlignment(data->GetType()),
5780+ true, // isUniform
5781+ "ClusteredBroadcastTmp");
5782+
5783+ // Copy from source to the uniform temp...
5784+ m_encoder->SetSecondHalf(clusterLane >= 16);
5785+ m_encoder->SetNoMask();
5786+ m_encoder->SetSrcRegion(0, 0, 1, 0);
5787+ m_encoder->SetSrcSubReg(0, clusterLane % numLanes(m_encoder->GetSimdSize()));
5788+ m_encoder->Copy(uniformTemp, data);
5789+ m_encoder->Push();
5790+ m_encoder->SetSecondHalf(false);
5791+
5792+ // ...and broadcast.
5793+ m_encoder->Copy(m_destination, uniformTemp);
5794+ m_encoder->Push();
5795+ m_encoder->SetSecondHalf(true);
5796+ m_encoder->Copy(m_destination, uniformTemp);
5797+ m_encoder->SetSecondHalf(false);
5798+ }
5799+ else
5800+ {
5801+ m_encoder->SetSrcRegion(0, 0, 1, 0);
5802+ m_encoder->SetSrcSubReg(0, clusterLane);
5803+ m_encoder->Copy(m_destination, data);
5804+ m_encoder->Push();
5805+ }
5806+ }
5807+ else
5808+ {
5809+ // Clustered broadcast.
5810+ m_encoder->SetSrcRegion(0, clusterSize, clusterSize, 0);
5811+ m_encoder->SetSrcSubReg(0, clusterLane);
57765812 m_encoder->Copy(m_destination, data);
5777- m_encoder->SetSecondHalf(false);
5813+ if (m_currShader->m_numberInstance > 1)
5814+ {
5815+ m_encoder->SetSecondHalf(true);
5816+ m_encoder->Copy(m_destination, data);
5817+ m_encoder->SetSecondHalf(false);
5818+ }
5819+ m_encoder->Push();
57785820 }
5779- m_encoder->Push();
5821+
57805822}
57815823
57825824void EmitPass::emitSimdShuffleDown(llvm::Instruction* inst)
0 commit comments