Skip to content

Commit 0ae060e

Browse files
pkwasnie-inteligcbot
authored and committed
new intrinsic: sub group clustered broadcast
Adds a new intrinsic: sub-group clustered broadcast. The initial implementation has the following restrictions: * Supports only cluster size 8 or 16. * Supports only a constant cluster size and a constant cluster lane.
1 parent 4ebc428 commit 0ae060e

File tree

13 files changed

+274
-0
lines changed

13 files changed

+274
-0
lines changed

IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,15 @@ half __builtin_IB_simd_broadcast_h( half, uint );
571571
double __builtin_IB_simd_broadcast_df( double, uint );
572572
void __builtin_IB_sub_group_barrier();
573573

574+
// SubGroup clustered broadcast - for internal use
// Arguments are ( value, cluster_size, in_cluster_lane ). One overload per
// element type; the suffix selects the type (_b bool, _c uchar, _us ushort,
// _f float, _h half, _df double, no suffix uint).
// cluster_size and in_cluster_lane must be compile-time constants: the
// sub-group function resolution pass reports an error otherwise.
// The initial implementation accepts only cluster_size 8 or 16.
575+
uint __builtin_IB_simd_clustered_broadcast( uint, uint, uint );
576+
bool __builtin_IB_simd_clustered_broadcast_b( bool, uint, uint );
577+
uchar __builtin_IB_simd_clustered_broadcast_c( uchar, uint, uint );
578+
ushort __builtin_IB_simd_clustered_broadcast_us( ushort, uint, uint );
579+
float __builtin_IB_simd_clustered_broadcast_f( float, uint, uint );
580+
half __builtin_IB_simd_clustered_broadcast_h( half, uint, uint );
581+
double __builtin_IB_simd_clustered_broadcast_df( double, uint, uint );
582+
574583
// Block read : global address space
575584
uint __builtin_IB_simd_block_read_1_global( const __global uint* );
576585
uint2 __builtin_IB_simd_block_read_2_global( const __global uint* );

IGC/Compiler/CISACodeGen/CodeSinking.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2489,6 +2489,7 @@ namespace IGC {
24892489
// Wave intrinsics
24902490
case GenISAIntrinsic::GenISA_WaveShuffleIndex:
24912491
case GenISAIntrinsic::GenISA_WaveBroadcast:
2492+
case GenISAIntrinsic::GenISA_WaveClusteredBroadcast:
24922493
case GenISAIntrinsic::GenISA_WaveBallot:
24932494
case GenISAIntrinsic::GenISA_WaveInverseBallot:
24942495
case GenISAIntrinsic::GenISA_WaveAll:

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5732,6 +5732,48 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
57325732
}
57335733
}
57345734

5735+
// Emits code for the GenISA_WaveClusteredBroadcast intrinsic.
//
// Operands of `inst`:
//   0 - the value to broadcast
//   1 - cluster size      (must be a ConstantInt; asserted to be 8 or 16)
//   2 - lane in cluster   (must be a ConstantInt; asserted to be < cluster size)
//
// The result is written to m_destination. When the shader has more than one
// instance (m_numberInstance > 1) the copy is issued a second time with the
// encoder's "second half" flag set.
//
// All preconditions are checked with IGC_ASSERT_MESSAGE only.
// NOTE(review): if those asserts compile out in release builds, a
// non-constant operand 1 or 2 reaches cast<ConstantInt> unchecked — confirm
// that the front-end resolution pass always rejects such calls first.
void EmitPass::emitSimdClusteredBroadcast(llvm::Instruction* inst)
5736+
{
5737+
CVariable* data = GetSymbol(inst->getOperand(0));
5738+
5739+
// If input is uniform, just copy to all lanes.
5740+
if (data->IsUniform())
5741+
{
5742+
m_encoder->Copy(m_destination, data);
5743+
// A non-uniform destination in a multi-instance shader needs the copy
// repeated for the second half.
if (!m_destination->IsUniform() && m_currShader->m_numberInstance > 1)
5744+
{
5745+
m_encoder->SetSecondHalf(true);
5746+
m_encoder->Copy(m_destination, data);
5747+
m_encoder->SetSecondHalf(false);
5748+
}
5749+
m_encoder->Push();
5750+
return;
5751+
}
5752+
5753+
// From here on the source is non-uniform, so the destination must be too.
IGC_ASSERT_MESSAGE(!m_destination->IsUniform(), "Unsupported: dst must be non-uniform");
5754+
5755+
// Operand 1: cluster size, read as an unsigned constant.
IGC_ASSERT_MESSAGE(isa<llvm::ConstantInt>(inst->getOperand(1)), "Unsupported: cluster size must be constant");
5756+
const unsigned int clusterSize = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
5757+
5758+
// Operand 2: index of the source lane inside each cluster, read as an unsigned constant.
IGC_ASSERT_MESSAGE(isa<llvm::ConstantInt>(inst->getOperand(2)), "Unsupported: cluster lane must be constant");
5759+
const unsigned int clusterLane = int_cast<uint32_t>(cast<llvm::ConstantInt>(inst->getOperand(2))->getZExtValue());
5760+
5761+
// Restrictions of this implementation: the cluster must be strictly smaller
// than the dispatch width, must be 8 or 16 lanes, and the lane must lie
// inside the cluster.
IGC_ASSERT_MESSAGE(clusterSize < numLanes(m_currShader->m_dispatchSize), "cluster size must be smaller than SIMD");
5762+
IGC_ASSERT_MESSAGE(clusterSize == 8 || clusterSize == 16, "cluster size must be 8 or 16");
5763+
IGC_ASSERT_MESSAGE(clusterLane < clusterSize, "cluster lane does not fit in cluster size");
5764+
5765+
// NOTE(review): presumably the SetSrcRegion arguments are (source index,
// vertical stride, width, horizontal stride), so with stride == width ==
// clusterSize and horizontal stride 0 every lane of a cluster reads the same
// element, and SetSrcSubReg offsets that element by clusterLane — confirm
// against the CEncoder definitions.
m_encoder->SetSrcRegion(0, clusterSize, clusterSize, 0);
5766+
m_encoder->SetSrcSubReg(0, clusterLane);
5767+
m_encoder->Copy(m_destination, data);
5768+
// Repeat the copy for the second half in multi-instance shaders. No Push()
// is issued between the two copies.
// NOTE(review): this appears to rely on the region/sub-register settings
// above still being in effect for the second copy — verify.
if (m_currShader->m_numberInstance > 1)
5769+
{
5770+
m_encoder->SetSecondHalf(true);
5771+
m_encoder->Copy(m_destination, data);
5772+
m_encoder->SetSecondHalf(false);
5773+
}
5774+
m_encoder->Push();
5775+
}
5776+
57355777
void EmitPass::emitSimdShuffleDown(llvm::Instruction* inst)
57365778
{
57375779
CVariable* pCurrentData = GetSymbol(inst->getOperand(0));
@@ -9133,6 +9175,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
91339175
case GenISAIntrinsic::GenISA_WaveBroadcast:
91349176
emitSimdShuffle(inst);
91359177
break;
9178+
case GenISAIntrinsic::GenISA_WaveClusteredBroadcast:
9179+
emitSimdClusteredBroadcast(inst);
9180+
break;
91369181
case GenISAIntrinsic::GenISA_WavePrefix:
91379182
emitWavePrefix(cast<WavePrefixIntrinsic>(inst));
91389183
break;

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ class EmitPass : public llvm::FunctionPass
243243
void emitSimdLaneIdReplicate(llvm::Instruction* inst);
244244
void emitSimdSize(llvm::Instruction* inst);
245245
void emitSimdShuffle(llvm::Instruction* inst);
246+
void emitSimdClusteredBroadcast(llvm::Instruction* inst);
246247
void emitCrossInstanceMov(const SSource& source, const DstModifier& modifier);
247248
void emitSimdShuffleDown(llvm::Instruction* inst);
248249
void emitSimdShuffleXor(llvm::Instruction* inst);

IGC/Compiler/CISACodeGen/PromoteInt8Type.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,6 +1113,7 @@ void PromoteInt8Type::promoteIntrinsic()
11131113
continue;
11141114
if (GII->isGenIntrinsic(GenISAIntrinsic::GenISA_WaveShuffleIndex) ||
11151115
GII->isGenIntrinsic(GenISAIntrinsic::GenISA_WaveBroadcast) ||
1116+
GII->isGenIntrinsic(GenISAIntrinsic::GenISA_WaveClusteredBroadcast) ||
11161117
GII->isGenIntrinsic(GenISAIntrinsic::GenISA_simdShuffleDown))
11171118
{
11181119
// Those are mov insts. Need to promote if its operand is
@@ -1166,6 +1167,7 @@ void PromoteInt8Type::promoteIntrinsic()
11661167
gid == GenISAIntrinsic::GenISA_QuadPrefix ||
11671168
gid == GenISAIntrinsic::GenISA_WaveShuffleIndex ||
11681169
gid == GenISAIntrinsic::GenISA_WaveBroadcast ||
1170+
gid == GenISAIntrinsic::GenISA_WaveClusteredBroadcast ||
11691171
gid == GenISAIntrinsic::GenISA_simdShuffleDown)
11701172
{
11711173
//
@@ -1204,10 +1206,12 @@ void PromoteInt8Type::promoteIntrinsic()
12041206
}
12051207
case GenISAIntrinsic::GenISA_WaveClustered:
12061208
case GenISAIntrinsic::GenISA_WaveInterleave:
1209+
case GenISAIntrinsic::GenISA_WaveClusteredBroadcast:
12071210
{
12081211
// prototype:
12091212
// Ty <clustered> (Ty, char, int, int)
12101213
// Ty <interleave> (Ty, char, int, int)
1214+
// Ty <clusteredbroadcast> (Ty, int, int, int)
12111215
iArgs.push_back(GII->getArgOperand(1));
12121216
iArgs.push_back(GII->getArgOperand(2));
12131217
iArgs.push_back(GII->getArgOperand(3));

IGC/Compiler/CISACodeGen/WIAnalysis.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1411,6 +1411,7 @@ WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CallInst* inst)
14111411
intrinsic_name == llvm_cycleCounter ||
14121412
intrinsic_name == llvm_waveShuffleIndex ||
14131413
intrinsic_name == llvm_waveBroadcast ||
1414+
intrinsic_name == llvm_waveClusteredBroadcast ||
14141415
intrinsic_name == llvm_waveBallot ||
14151416
intrinsic_name == llvm_waveAll ||
14161417
intrinsic_name == llvm_waveClustered ||

IGC/Compiler/CISACodeGen/helper.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1594,6 +1594,7 @@ namespace IGC
15941594
{
15951595
case GenISAIntrinsic::GenISA_WaveShuffleIndex:
15961596
case GenISAIntrinsic::GenISA_WaveBroadcast:
1597+
case GenISAIntrinsic::GenISA_WaveClusteredBroadcast:
15971598
case GenISAIntrinsic::GenISA_simdShuffleDown:
15981599
case GenISAIntrinsic::GenISA_simdShuffleXor:
15991600
case GenISAIntrinsic::GenISA_simdBlockRead:
@@ -1882,6 +1883,7 @@ namespace IGC
18821883
opcode == llvm_wavePrefix ||
18831884
opcode == llvm_waveShuffleIndex ||
18841885
opcode == llvm_waveBroadcast ||
1886+
opcode == llvm_waveClusteredBroadcast ||
18851887
opcode == llvm_waveBallot ||
18861888
opcode == llvm_simdShuffleDown ||
18871889
opcode == llvm_simdBlockRead||

IGC/Compiler/CISACodeGen/opCode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ DECLARE_OPCODE(GenISA_WavePrefix, GenISAIntrinsic, llvm_wavePrefix, false, false
289289
DECLARE_OPCODE(GenISA_QuadPrefix, GenISAIntrinsic, llvm_quadPrefix, false, false, false, false, false, false, false)
290290
DECLARE_OPCODE(GenISA_WaveShuffleIndex, GenISAIntrinsic, llvm_waveShuffleIndex, false, false, false, false, false, false, false)
291291
DECLARE_OPCODE(GenISA_WaveBroadcast, GenISAIntrinsic, llvm_waveBroadcast, false, false, false, false, false, false, false)
292+
DECLARE_OPCODE(GenISA_WaveClusteredBroadcast, GenISAIntrinsic, llvm_waveClusteredBroadcast, false, false, false, false, false, false, false)
292293

293294
// Unmasked region
294295
DECLARE_OPCODE(GenISA_UnmaskedRegionBegin, GenISAIntrinsic, llvm_unmaskedBegin, false, false, false, false, false, false, false)

IGC/Compiler/Optimizer/OpenCLPasses/SubGroupFuncs/SubGroupFuncsResolution.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,13 @@ const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_BROADCAST_US = "__built
5151
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_BROADCAST_F = "__builtin_IB_simd_broadcast_f";
5252
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_BROADCAST_H = "__builtin_IB_simd_broadcast_h";
5353
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_BROADCAST_DF = "__builtin_IB_simd_broadcast_df";
54+
// Names of the clustered-broadcast built-ins (one per element type) that
// visitCallInst matches by function name and replaces with a
// GenISA_WaveClusteredBroadcast intrinsic call.
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST = "__builtin_IB_simd_clustered_broadcast";
55+
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_B = "__builtin_IB_simd_clustered_broadcast_b";
56+
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_C = "__builtin_IB_simd_clustered_broadcast_c";
57+
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_US = "__builtin_IB_simd_clustered_broadcast_us";
58+
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_F = "__builtin_IB_simd_clustered_broadcast_f";
59+
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_H = "__builtin_IB_simd_clustered_broadcast_h";
60+
const llvm::StringRef SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_DF = "__builtin_IB_simd_clustered_broadcast_df";
5461
const llvm::StringRef SubGroupFuncsResolution::SIMD_BLOCK_READ_1_GBL = "__builtin_IB_simd_block_read_1_global";
5562
const llvm::StringRef SubGroupFuncsResolution::SIMD_BLOCK_READ_2_GBL = "__builtin_IB_simd_block_read_2_global";
5663
const llvm::StringRef SubGroupFuncsResolution::SIMD_BLOCK_READ_4_GBL = "__builtin_IB_simd_block_read_4_global";
@@ -680,6 +687,41 @@ void SubGroupFuncsResolution::visitCallInst(CallInst& CI)
680687
CI.replaceAllUsesWith(simdBroadcast);
681688
CI.eraseFromParent();
682689
}
690+
else if (funcName.equals(SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST) ||
691+
funcName.equals(SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_US) ||
692+
funcName.equals(SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_F) ||
693+
funcName.equals(SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_H) ||
694+
funcName.equals(SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_C) ||
695+
funcName.equals(SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_B) ||
696+
funcName.equals(SubGroupFuncsResolution::SUB_GROUP_CLUSTERED_BROADCAST_DF)
697+
)
698+
{
699+
// Creates intrinsics that will be lowered in the CodeGen and will handle the sub_group_clustered_broadcast function
700+
IRBuilder<> IRB(&CI);
701+
Value* args[4];
702+
args[0] = CI.getArgOperand(0);
703+
args[1] = CI.getArgOperand(1);
704+
args[2] = CI.getArgOperand(2);
705+
args[3] = IRB.getInt32(0);
706+
707+
if (!isa<ConstantInt>(args[1]))
708+
{
709+
m_pCtx->EmitError("cluster_size argument in clustered_broadcast must be constant.", &CI);
710+
return;
711+
}
712+
if (!isa<ConstantInt>(args[2]))
713+
{
714+
m_pCtx->EmitError("in_cluster_lane argument in clustered_broadcast must be constant.", &CI);
715+
return;
716+
}
717+
718+
Function* simdClusteredBroadcastFunc = GenISAIntrinsic::getDeclaration(CI.getCalledFunction()->getParent(),
719+
GenISAIntrinsic::GenISA_WaveClusteredBroadcast, args[0]->getType());
720+
Instruction* simdClusteredBroadcast = CallInst::Create(simdClusteredBroadcastFunc, args, "simdClusteredBroadcast", &CI);
721+
updateDebugLoc(&CI, simdClusteredBroadcast);
722+
CI.replaceAllUsesWith(simdClusteredBroadcast);
723+
CI.eraseFromParent();
724+
}
683725
else if (funcName.equals(SubGroupFuncsResolution::SUB_GROUP_SHUFFLE_DOWN) ||
684726
funcName.equals(SubGroupFuncsResolution::SUB_GROUP_SHUFFLE_DOWN_US) ||
685727
funcName.equals(SubGroupFuncsResolution::SUB_GROUP_SHUFFLE_DOWN_UC))

IGC/Compiler/Optimizer/OpenCLPasses/SubGroupFuncs/SubGroupFuncsResolution.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,13 @@ namespace IGC
8989
static const llvm::StringRef SUB_GROUP_BROADCAST_C;
9090
static const llvm::StringRef SUB_GROUP_BROADCAST_B;
9191
static const llvm::StringRef SUB_GROUP_BROADCAST_DF;
92+
static const llvm::StringRef SUB_GROUP_CLUSTERED_BROADCAST;
93+
static const llvm::StringRef SUB_GROUP_CLUSTERED_BROADCAST_US;
94+
static const llvm::StringRef SUB_GROUP_CLUSTERED_BROADCAST_F;
95+
static const llvm::StringRef SUB_GROUP_CLUSTERED_BROADCAST_H;
96+
static const llvm::StringRef SUB_GROUP_CLUSTERED_BROADCAST_C;
97+
static const llvm::StringRef SUB_GROUP_CLUSTERED_BROADCAST_B;
98+
static const llvm::StringRef SUB_GROUP_CLUSTERED_BROADCAST_DF;
9299

93100
static const llvm::StringRef SIMD_BLOCK_READ_1_GBL;
94101
static const llvm::StringRef SIMD_BLOCK_READ_2_GBL;

0 commit comments

Comments
 (0)