Skip to content

Commit e98892d

Browse files
aratajewigcbot
authored andcommitted
Fix GenISA_WaveShuffleIndex intrinsic if src and dst are the same variables
To handle `GenISA_WaveShuffleIndex` intrisc with non-uniform `simdChannel`, IGC needs to generate two SIMD16 indirectly addressed mov instructions, because address register has only 16 subregisters. If that happens when `GenISA_WaveShuffleIndex` intrinsic uses the same variable as a source, and as a destination, then the first SIMD16 instruction may overwrite values used as a source by the second SIMD16 instruction. Here is the example of an OpenCL C code that reproduces the issue: ```c __attribute__((intel_reqd_sub_group_size(32))) kernel void k(global int* in, global int* ids, uint num_iterations, global int* out) { size_t gid = get_global_id(0); int x = in[gid]; uint which_sub_group_local_id = ids[gid]; for (uint i = 0; i < num_iterations; ++i) { x = intel_sub_group_shuffle(x, which_sub_group_local_id); } out[gid] = x; } ``` This change fixes the issue by writing the result for the first 16 channels into a temporary variable, before executing shuffle index for the last 16 channels
1 parent a7ed19e commit e98892d

File tree

2 files changed

+126
-8
lines changed

2 files changed

+126
-8
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5509,18 +5509,47 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
55095509
}
55105510
else if(GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveShuffleIndex)
55115511
{
5512-
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5512+
if (channelUniform)
5513+
{
5514+
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5515+
m_encoder->Push();
55135516

5514-
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5515-
m_encoder->Push();
5517+
m_encoder->Copy(m_destination, pDstArrElm);
5518+
m_encoder->Push();
5519+
}
5520+
else // !channelUniform
5521+
{
5522+
m_encoder->SetSimdSize(SIMDMode::SIMD16);
55165523

5517-
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5524+
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
5525+
m_encoder->Push();
55185526

5519-
m_encoder->Copy(m_destination, pDstArrElm);
5520-
m_encoder->Push();
5527+
m_encoder->SetSimdSize(SIMDMode::SIMD16);
55215528

5522-
if (!channelUniform)
5523-
{
5529+
// If `src` variable is the same as `m_destination` variable, we cannot write the result
5530+
// for the first 16 channels to `m_destionation` right away, because it may overwrite `src`
5531+
// values used by the last 16 channels. Instead, the result for the first 16 channels
5532+
// gets written into a temporary variable, then shuffle index for the last 16 channels
5533+
// is generated and, at the end, the result for first 16 channels is rewritten from
5534+
// the temporary variable into the `m_destination`.
5535+
const bool isSrcSameAsDst = src == m_destination;
5536+
CVariable* first16LanesResult = nullptr;
5537+
if (isSrcSameAsDst)
5538+
{
5539+
first16LanesResult = m_currShader->GetNewVariable(
5540+
16,
5541+
m_destination->GetType(),
5542+
m_destination->GetAlign(),
5543+
false, // isUniform
5544+
"first16LanesResult");
5545+
5546+
m_encoder->Copy(first16LanesResult, pDstArrElm);
5547+
}
5548+
else
5549+
{
5550+
m_encoder->Copy(m_destination, pDstArrElm);
5551+
}
5552+
m_encoder->Push();
55245553

55255554
m_encoder->SetSimdSize(SIMDMode::SIMD16);
55265555
m_encoder->SetMask(EMASK_H2);
@@ -5536,6 +5565,16 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
55365565
m_encoder->Copy(m_destination, pDstArrElm);
55375566
m_encoder->Push();
55385567
m_encoder->SetSecondHalf(false);
5568+
5569+
m_encoder->Push();
5570+
5571+
if (isSrcSameAsDst)
5572+
{
5573+
IGC_ASSERT(first16LanesResult);
5574+
m_encoder->SetSimdSize(SIMDMode::SIMD16);
5575+
m_encoder->Copy(m_destination, first16LanesResult);
5576+
m_encoder->Push();
5577+
}
55395578
}
55405579
}
55415580
if (disableHelperLanes)
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2024 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
// UNSUPPORTED: sys32
10+
// REQUIRES: regkeys, pvc-supported
11+
12+
// RUN: ocloc compile -file %s -device pvc -options "-igc_opts 'DumpVISAASMToConsole=1'" | FileCheck %s
13+
14+
// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_immediate_index_simd32"
15+
__attribute__((intel_reqd_sub_group_size(32)))
16+
kernel void test_intel_sub_group_shuffle_immediate_index_simd32(global int* in, global int* ids, global int* out) {
17+
size_t gid = get_global_id(0);
18+
int x = in[gid];
19+
20+
// CHECK: mov (M5_NM, 1) simdShuffle(0,0)<1> V0039(1,15)<0;1,0>
21+
22+
// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
23+
// CHECK: lsc_store.ugm (M1, 32) flat[V0041]:a64 simdShuffleBroadcast:d32
24+
out[gid] = intel_sub_group_shuffle(x, 31);
25+
}
26+
27+
// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32"
28+
__attribute__((intel_reqd_sub_group_size(32)))
29+
kernel void test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32(global int* in, global int* ids, uint which_sub_group_local_id, global int* out) {
30+
size_t gid = get_global_id(0);
31+
int x = in[gid];
32+
33+
// CHECK: shl (M1_NM, 1) ShuffleTmp(0,0)<1> which_sub_group_local_id_0(0,0)<0;1,0> 0x2:uw
34+
// CHECK-NEXT: addr_add (M1_NM, 1) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<0;1,0>
35+
// CHECK-NEXT: mov (M1_NM, 1) simdShuffle(0,0)<1> r[A0(0),0]<0;1,0>:d
36+
37+
// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
38+
// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdShuffleBroadcast:d32
39+
out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id);
40+
}
41+
42+
// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32"
43+
__attribute__((intel_reqd_sub_group_size(32)))
44+
kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32(global int* in, global int* ids, global int* out) {
45+
size_t gid = get_global_id(0);
46+
int x = in[gid];
47+
uint which_sub_group_local_id = ids[gid];
48+
49+
// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
50+
// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
51+
// CHECK-NEXT: mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d
52+
// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
53+
// CHECK-NEXT: mov (M5, 16) simdShuffle(1,0)<1> r[A0(0),0]<1,0>:d
54+
55+
// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdShuffle:d32
56+
out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id);
57+
}
58+
59+
// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32"
60+
__attribute__((intel_reqd_sub_group_size(32)))
61+
kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32(global int* in, global int* ids, uint num_iterations, global int* out) {
62+
size_t gid = get_global_id(0);
63+
int x = in[gid];
64+
uint which_sub_group_local_id = ids[gid];
65+
66+
for (uint i = 0; i < num_iterations; ++i)
67+
{
68+
// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
69+
// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
70+
// CHECK-NEXT: mov (M1, 16) first16LanesResult(0,0)<1> r[A0(0),0]<1,0>:d
71+
// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
72+
// CHECK-NEXT: mov (M5, 16) [[X]](1,0)<1> r[A0(0),0]<1,0>:d
73+
// CHECK-NEXT: mov (M1, 16) [[X]](0,0)<1> first16LanesResult(0,0)<1;1,0>
74+
x = intel_sub_group_shuffle(x, which_sub_group_local_id);
75+
}
76+
77+
// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 [[X]]:d32
78+
out[gid] = x;
79+
}

0 commit comments

Comments
 (0)