Skip to content

Commit f3b28a8

Browse files
aratajewigcbot
authored andcommitted
Fix handling of GenISA_WaveBroadcast
This commit introduces the following changes: 1) The logic that forces `localID` argument of `GenISA_WaveBroadcast` gets removed from `WIAnalysis`. `WIAnalysis` is a pass that analyses instructions' uniformity and tries to propage it top-down. Forcing a single instruction, that is used as local ID argument, is not consistent with top-down nature of `WIAnalysis`, therefore it may potentialy cause unexpected issues, ex. a load instruction's destination is uniform while the pointer it uses is non-uniform. 2) `emitSimdShuffle` function now takes advantage of restrictions of OpGroupBroadcast. OpGroupBroadcast guarantees that all channels must be enabled and the `localID` value must be the same for all of them. Therefore, even though it was not possible to deduce, during compilation time, that `localID` is uniform, `emitSimdShuffle` forces it to be uniform by taking it always from the first channel. 3) Adds `ocloc_test` for `sub_group_broadcast` and `sub_group_non_uniform_broadcast`.
1 parent afba9d6 commit f3b28a8

File tree

4 files changed

+219
-38
lines changed

4 files changed

+219
-38
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 77 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5404,19 +5404,58 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
54045404
else
54055405
{
54065406
// Emits below instructions when simdChannel isn't immediate.
5407-
//shl (16) r8.0<1>:ud r6.0<0;1,0>:d 0x2:uw {Align1, H1, NoMask}
5408-
//add (16) a0.0<1>:uw r8.0<16;8,2>:uw 0x80:uw {Align1, H1, NoMask}
5409-
//mov (16) r10.0<1>:d r[a0.0, 0]<1,0>:d {Align1, H1}
5410-
// For SIMD32:
5411-
// shl(M1, 32) V465(0, 0)<1> V464(0, 0)<16; 8, 2> 0x2:uw /// $592
5412-
// mov(M1, 32) V466(0, 0)<1> V70(0, 0)<1; 1, 0> /// $593
5413-
// addr_add(M1, 16) A0(0)<1> &V466 + 0 V465(0, 0)<1; 1, 0> /// $594
5414-
// mov(M1, 16) V463(0, 0)<1> r[A0(0), 0]<1, 0> : f /// $595
5415-
// addr_add(M5, 16) A0(0)<1> &V466 + 0 V465(0, 16)<1; 1, 0> /// $596
5416-
// mov(M5, 16) V463(1, 0)<1> r[A0(0), 0]<1, 0> : f /// $597
5407+
//
5408+
// 1) GenISA_WaveShuffleIndex:
5409+
//
5410+
// a) SIMD16
5411+
// shl (M1, 16) ShuffleTmp(0,0)<1> {{.+}}(0,0)<16;8,2> 0x2:uw
5412+
// addr_add (M1, 16) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<1;1,0>
5413+
// mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d
5414+
//
5415+
// b) SIMD32 (two SIMD16 ADDR_ADD instructions must be generated,
5416+
// because address register has only 16 elements):
5417+
// shl(M1, 32) V465(0,0)<1> V464(0,0)<16;8,2> 0x2:uw
5418+
// mov(M1, 32) V466(0,0)<1> V70(0,0)<1;1,0>
5419+
// addr_add(M1, 16) A0(0)<1> &V466 + 0 V465(0, 0)<1;1,0>
5420+
// mov(M1, 16) V463(0,0)<1> r[A0(0),0]<1, 0>:f
5421+
// addr_add(M5, 16) A0(0)<1> &V466 + 0 V465(0,16)<1;1,0>
5422+
// mov(M5, 16) V463(1,0)<1> r[A0(0),0]<1,0>:f
5423+
//
5424+
// 2) GenISA_WaveBroadcast:
5425+
//
5426+
// shl (M1_NM, 1) ShuffleTmp(0,0)<1> {{.+}}(0,0)<0;1,0> 0x2:uw
5427+
// addr_add(M1_NM, 1) A0(0) <1> &{{V[0 - 9]+}} ShuffleTmp(0, 0) < 0;1,0 >
5428+
// a) SIMD16:
5429+
// mov(M1, 16) simdBroadcast(0,0) <1> r[A0(0),0] <0;1,0>:d
5430+
// b) SIMD32 (no need for two SIMD16 instructions, because offset in A0 is uniform):
5431+
// mov(M1, 32) simdBroadcast(0,0) <1> r[A0(0),0] <0;1,0>:d
54175432

54185433
bool channelUniform = simdChannel->IsUniform();
54195434

5435+
auto* GII = dyn_cast<GenIntrinsicInst>(inst);
5436+
if (GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveBroadcast &&
5437+
!channelUniform)
5438+
{
5439+
// OpGroupBroadcast guarantees that all channels must be enabled and the
5440+
// simdChannel value must be the same for all of them. Therefore, even though
5441+
// it was not possible to deduce, during compilation time, that simdChannel is
5442+
// uniform, let's force it to be uniform by taking it always from the first channel.
5443+
CVariable* valueFromFirstChannel = m_currShader->GetNewVariable(
5444+
numLanes(SIMDMode::SIMD1),
5445+
simdChannel->GetType(),
5446+
simdChannel->GetAlign(), true, CName::NONE);
5447+
5448+
m_encoder->SetSimdSize(SIMDMode::SIMD1);
5449+
m_encoder->SetNoMask();
5450+
m_encoder->SetSrcRegion(0, 0, 1, 0);
5451+
5452+
m_encoder->Copy(valueFromFirstChannel, simdChannel);
5453+
m_encoder->Push();
5454+
5455+
simdChannel = valueFromFirstChannel;
5456+
channelUniform = true;
5457+
}
5458+
54205459
IGC_ASSERT_MESSAGE(m_encoder->GetCISADataTypeSize(simdChannel->GetType()) == 4,
54215460
"simdChannel size of simdShuffle should be 4 bytes!");
54225461

@@ -5450,7 +5489,6 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
54505489
CVariable* src = data;
54515490
if (m_currShader->m_numberInstance == 1 && m_currShader->m_SIMDSize == SIMDMode::SIMD32)
54525491
{
5453-
54545492
uint16_t addrSize = channelUniform ? 1 : numLanes(SIMDMode::SIMD16);
54555493

54565494
// VectorUniform for shuffle is true as all simd lanes will
@@ -5462,34 +5500,43 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
54625500
true,
54635501
m_destination->getName());
54645502

5465-
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5466-
5467-
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5468-
m_encoder->Push();
5469-
5470-
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5471-
5472-
m_encoder->Copy(m_destination, pDstArrElm);
5473-
m_encoder->Push();
5474-
5475-
// If destination is uniform, don't execute second half.
5476-
if (!channelUniform && !m_destination->IsUniform())
5503+
if (GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveBroadcast)
5504+
{
5505+
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5506+
m_encoder->Push();
5507+
m_encoder->Copy(m_destination, pDstArrElm);
5508+
m_encoder->Push();
5509+
}
5510+
else if(GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveShuffleIndex)
54775511
{
5478-
54795512
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5480-
m_encoder->SetMask(EMASK_H2);
5481-
m_encoder->SetSrcSubReg(0, 16);
5482-
m_encoder->SetSrcSubReg(1, 16);
5513+
54835514
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
54845515
m_encoder->Push();
54855516

54865517
m_encoder->SetSimdSize(SIMDMode::SIMD16);
54875518

5488-
m_encoder->SetMask(EMASK_H2);
5489-
m_encoder->SetDstSubReg(16);
54905519
m_encoder->Copy(m_destination, pDstArrElm);
54915520
m_encoder->Push();
5492-
m_encoder->SetSecondHalf(false);
5521+
5522+
if (!channelUniform)
5523+
{
5524+
5525+
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5526+
m_encoder->SetMask(EMASK_H2);
5527+
m_encoder->SetSrcSubReg(0, 16);
5528+
m_encoder->SetSrcSubReg(1, 16);
5529+
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5530+
m_encoder->Push();
5531+
5532+
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5533+
5534+
m_encoder->SetMask(EMASK_H2);
5535+
m_encoder->SetDstSubReg(16);
5536+
m_encoder->Copy(m_destination, pDstArrElm);
5537+
m_encoder->Push();
5538+
m_encoder->SetSecondHalf(false);
5539+
}
54935540
}
54945541
if (disableHelperLanes)
54955542
{

IGC/Compiler/CISACodeGen/WIAnalysis.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -864,14 +864,6 @@ void WIAnalysisRunner::calculate_dep(const Value* val)
864864
m_CGCtx->EmitWarning(msg.c_str());
865865
}
866866
#endif // DEBUG
867-
// If the local ID comes directly from a load instruction, then we can't mark
868-
// it as uniform, because we'll end up with a load instruction that has a uniform
869-
// `dst` and a non-uniform `src`. Such a case cannot be handled properly by
870-
// EmitVISAPass as we don't know which channel `src` should be taken from.
871-
if (!isa<LoadInst>(inst))
872-
{
873-
dep = WIAnalysis::UNIFORM_THREAD;
874-
}
875867
}
876868

877869
// If the value was changed in this calculation
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2024 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
// UNSUPPORTED: sys32
10+
// REQUIRES: regkeys, pvc-supported
11+
12+
// RUN: ocloc compile -file %s -device pvc -options "-igc_opts 'DumpVISAASMToConsole=1'" | FileCheck %s
13+
14+
__attribute__((intel_reqd_sub_group_size(16)))
15+
kernel void test_sub_group_broadcast_non_immediate_sub_group_local_id_simd16(global int* in, global int* ids, global int* out) {
16+
size_t gid = get_global_id(0);
17+
int x = in[gid];
18+
uint which_sub_group_local_id = ids[gid];
19+
// CHECK-LABEL: .kernel "test_sub_group_broadcast_non_immediate_sub_group_local_id_simd16"
20+
// CHECK: shl (M1_NM, 1) ShuffleTmp(0,0)<1> {{.+}}(0,0)<0;1,0> 0x2:uw
21+
// CHECK: addr_add (M1_NM, 1) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<0;1,0>
22+
// CHECK: mov (M1, 16) simdBroadcast(0,0)<1> r[A0(0),0]<0;1,0>:d
23+
// CHECK: lsc_store.ugm (M1, 16) flat[{{.+}}]:a64 simdBroadcast:d32
24+
out[gid] = sub_group_broadcast(x, which_sub_group_local_id);
25+
}
26+
27+
__attribute__((intel_reqd_sub_group_size(32)))
28+
kernel void test_sub_group_broadcast_non_immediate_sub_group_local_id_simd32(global int* in, global int* ids, global int* out) {
29+
size_t gid = get_global_id(0);
30+
int x = in[gid];
31+
uint which_sub_group_local_id = ids[gid];
32+
// CHECK-LABEL: .kernel "test_sub_group_broadcast_non_immediate_sub_group_local_id_simd32"
33+
// CHECK: shl (M1_NM, 1) ShuffleTmp(0,0)<1> {{.+}}(0,0)<0;1,0> 0x2:uw
34+
// CHECK: addr_add (M1_NM, 1) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<0;1,0>
35+
// CHECK: mov (M1, 32) simdBroadcast(0,0)<1> r[A0(0),0]<0;1,0>:d
36+
// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdBroadcast:d32
37+
out[gid] = sub_group_broadcast(x, which_sub_group_local_id);
38+
}
39+
40+
__attribute__((intel_reqd_sub_group_size(16)))
41+
kernel void test_sub_group_broadcast_immediate_sub_group_local_id_simd16(global int* in, global int* ids, global int* out) {
42+
size_t gid = get_global_id(0);
43+
int x = in[gid];
44+
uint which_sub_group_local_id = 15;
45+
// CHECK-LABEL: .kernel "test_sub_group_broadcast_immediate_sub_group_local_id_simd16"
46+
// CHECK: mov (M1_NM, 1) simdBroadcast(0,0)<1> {{V[0-9]+}}(0,15)<0;1,0>
47+
// CHECK: mov (M1, 16) simdBroadcastBroadcast(0,0)<1> simdBroadcast(0,0)<0;1,0>
48+
// CHECK: lsc_store.ugm (M1, 16) flat[{{.+}}]:a64 simdBroadcastBroadcast:d32
49+
out[gid] = sub_group_broadcast(x, which_sub_group_local_id);
50+
}
51+
52+
__attribute__((intel_reqd_sub_group_size(32)))
53+
kernel void test_sub_group_broadcast_immediate_sub_group_local_id_simd32(global int* in, global int* ids, global int* out) {
54+
size_t gid = get_global_id(0);
55+
int x = in[gid];
56+
uint which_sub_group_local_id = 31;
57+
// CHECK-LABEL: .kernel "test_sub_group_broadcast_immediate_sub_group_local_id_simd32"
58+
// CHECK: mov (M5_NM, 1) simdBroadcast(0,0)<1> {{V[0-9]+}}(1,15)<0;1,0>
59+
// CHECK: mov (M1, 32) simdBroadcastBroadcast(0,0)<1> simdBroadcast(0,0)<0;1,0>
60+
// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdBroadcastBroadcast:d32
61+
out[gid] = sub_group_broadcast(x, which_sub_group_local_id);
62+
}
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2024 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
// UNSUPPORTED: sys32
10+
// REQUIRES: regkeys, pvc-supported
11+
12+
// RUN: ocloc compile -file %s -device pvc -options "-igc_opts 'DumpVISAASMToConsole=1'" | FileCheck %s
13+
14+
__attribute__((intel_reqd_sub_group_size(16)))
15+
kernel void test_sub_group_non_uniform_broadcast_non_immediate_sub_group_local_id_simd16(global int* in, global int* ids, global int* out) {
16+
size_t gid = get_global_id(0);
17+
int x = in[gid];
18+
uint which_sub_group_local_id = ids[gid];
19+
// CHECK-LABEL: .kernel "test_sub_group_non_uniform_broadcast_non_immediate_sub_group_local_id_simd16"
20+
// CHECK: shl (M1, 16) ShuffleTmp(0,0)<1> {{.+}}(0,0)<16;8,2> 0x2:uw
21+
// CHECK: addr_add (M1, 16) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<1;1,0>
22+
// CHECK: mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d
23+
// CHECK: lsc_store.ugm (M1, 16) flat[{{.+}}]:a64 simdShuffle:d32
24+
bool isOddLane = get_sub_group_local_id() % 2 == 1;
25+
if (isOddLane)
26+
{
27+
out[gid] = sub_group_non_uniform_broadcast(x, which_sub_group_local_id);
28+
}
29+
}
30+
31+
__attribute__((intel_reqd_sub_group_size(32)))
32+
kernel void test_sub_group_non_uniform_broadcast_non_immediate_sub_group_local_id_simd32(global int* in, global int* ids, global int* out) {
33+
size_t gid = get_global_id(0);
34+
int x = in[gid];
35+
uint which_sub_group_local_id = ids[gid];
36+
// CHECK-LABEL: .kernel "test_sub_group_non_uniform_broadcast_non_immediate_sub_group_local_id_simd32"
37+
// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{.+}}(0,0)<16;8,2> 0x2:uw
38+
// CHECK: addr_add (M1, 16) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<1;1,0>
39+
// CHECK: mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d
40+
// CHECK: addr_add (M5, 16) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,16)<1;1,0>
41+
// CHECK: mov (M5, 16) simdShuffle(1,0)<1> r[A0(0),0]<1,0>:d
42+
// CHECK: lsc_store.ugm (M1, 32) flat[V0046]:a64 simdShuffle:d32
43+
bool isOddLane = get_sub_group_local_id() % 2 == 1;
44+
if (isOddLane)
45+
{
46+
out[gid] = sub_group_non_uniform_broadcast(x, which_sub_group_local_id);
47+
}
48+
}
49+
50+
__attribute__((intel_reqd_sub_group_size(16)))
51+
kernel void test_sub_group_non_uniform_broadcast_immediate_sub_group_local_id_simd16(global int* in, global int* ids, global int* out) {
52+
size_t gid = get_global_id(0);
53+
int x = in[gid];
54+
uint which_sub_group_local_id = 15;
55+
// CHECK-LABEL: .kernel "test_sub_group_non_uniform_broadcast_immediate_sub_group_local_id_simd16"
56+
// CHECK: mov (M1_NM, 1) simdShuffle(0,0)<1> {{V[0-9]+}}(0,15)<0;1,0>
57+
// CHECK: mov (M1, 16) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
58+
// CHECK: lsc_store.ugm (M1, 16) flat[{{.+}}]:a64 simdShuffleBroadcast:d32
59+
bool isOddLane = get_sub_group_local_id() % 2 == 1;
60+
if (isOddLane)
61+
{
62+
out[gid] = sub_group_non_uniform_broadcast(x, which_sub_group_local_id);
63+
}
64+
}
65+
66+
__attribute__((intel_reqd_sub_group_size(32)))
67+
kernel void test_sub_group_non_uniform_broadcast_immediate_sub_group_local_id_simd32(global int* in, global int* ids, global int* out) {
68+
size_t gid = get_global_id(0);
69+
int x = in[gid];
70+
uint which_sub_group_local_id = 31;
71+
// CHECK-LABEL: .kernel "test_sub_group_non_uniform_broadcast_immediate_sub_group_local_id_simd32"
72+
// CHECK: mov (M5_NM, 1) simdShuffle(0,0)<1> {{V[0-9]+}}(1,15)<0;1,0>
73+
// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
74+
// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdShuffleBroadcast:d32
75+
bool isOddLane = get_sub_group_local_id() % 2 == 1;
76+
if (isOddLane)
77+
{
78+
out[gid] = sub_group_non_uniform_broadcast(x, which_sub_group_local_id);
79+
}
80+
}

0 commit comments

Comments
 (0)