Skip to content

Commit 0a90ec2

Browse files
iwwuigcbot
authored and committed
Refactor CS SelectWalkOrder and implement new SelectCSWalkOrder pass
New SelectCSWalkOrder pass will be enabled later. Create struct SComputeShaderWalkOrder to store fields for walk order
1 parent d195b1e commit 0a90ec2

File tree

10 files changed

+352
-0
lines changed

10 files changed

+352
-0
lines changed

IGC/Compiler/CISACodeGen/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ set(IGC_BUILD__SRC__CISACodeGen_Common
2929
"${CMAKE_CURRENT_SOURCE_DIR}/CodeSinking.cpp"
3030
"${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderBase.cpp"
3131
"${CMAKE_CURRENT_SOURCE_DIR}/ConstantCoalescing.cpp"
32+
"${CMAKE_CURRENT_SOURCE_DIR}/CSWalkOrder.cpp"
3233
"${CMAKE_CURRENT_SOURCE_DIR}/DeSSA.cpp"
3334
"${CMAKE_CURRENT_SOURCE_DIR}/DebugInfo.cpp"
3435
"${CMAKE_CURRENT_SOURCE_DIR}/DpasScan.cpp"
@@ -131,6 +132,7 @@ set(IGC_BUILD__HDR__CISACodeGen_Common
131132
"${CMAKE_CURRENT_SOURCE_DIR}/CodeSinking.hpp"
132133
"${CMAKE_CURRENT_SOURCE_DIR}/ComputeShaderBase.hpp"
133134
"${CMAKE_CURRENT_SOURCE_DIR}/ConstantCoalescing.hpp"
135+
"${CMAKE_CURRENT_SOURCE_DIR}/CSWalkOrder.hpp"
134136
"${CMAKE_CURRENT_SOURCE_DIR}/DeSSA.hpp"
135137
"${CMAKE_CURRENT_SOURCE_DIR}/DebugInfo.hpp"
136138
"${CMAKE_CURRENT_SOURCE_DIR}/DebugInfoData.hpp"
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2022-2024 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
#include "common/LLVMWarningsPush.hpp"
10+
#include <llvm/IR/PatternMatch.h>
11+
#include <llvm/Pass.h>
12+
#include <llvm/Support/Debug.h>
13+
#include <llvm/Support/raw_ostream.h>
14+
#include "common/LLVMWarningsPop.hpp"
15+
#include "GenISAIntrinsics/GenIntrinsics.h"
16+
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
17+
#include "Compiler/MetaDataUtilsWrapper.h"
18+
#include "Compiler/CISACodeGen/CSWalkOrder.hpp"
19+
#include "Probe/Assertion.h"
20+
21+
using namespace llvm;
22+
using namespace IGC;
23+
24+
25+
void IGC::overrideWalkOrderKeysInPass(
26+
bool is_pow2_x, bool is_pow2_y, bool is_pow2_z,
27+
SComputeShaderWalkOrder& walkOrderStruct,
28+
CodeGenContext* ctx)
29+
{
30+
ThreadIDLayout& threadIDLayout = walkOrderStruct.m_threadIDLayout;
31+
CS_WALK_ORDER& walkOrder = walkOrderStruct.m_walkOrder;
32+
bool& enableHWGenerateLID = walkOrderStruct.m_enableHWGenerateLID;
33+
34+
ModuleMetaData* MMD = ctx->getModuleMetaData();
35+
const CPlatform& platform = ctx->platform;
36+
const CDriverInfo& driverInfo = ctx->m_DriverInfo;
37+
38+
if ((IGC_IS_FLAG_ENABLED(ForceTileY) || MMD->csInfo.forceTileYWalk) &&
39+
platform.supportHWGenerateTID() && driverInfo.SupportHWGenerateTID())
40+
{
41+
threadIDLayout = ThreadIDLayout::TileY;
42+
walkOrder = CS_WALK_ORDER::WO_YXZ;
43+
enableHWGenerateLID = enableHWGenerateLIDInPass(walkOrder, is_pow2_x, is_pow2_y, is_pow2_z);
44+
}
45+
46+
if (MMD->csInfo.walkOrderEnabled)
47+
{
48+
walkOrder = (CS_WALK_ORDER)MMD->csInfo.walkOrderOverride;
49+
enableHWGenerateLID = enableHWGenerateLIDInPass(walkOrder, is_pow2_x, is_pow2_y, is_pow2_z);
50+
}
51+
52+
if (IGC_IS_FLAG_ENABLED(OverrideCsWalkOrderEnable))
53+
{
54+
walkOrder = (CS_WALK_ORDER)IGC_GET_FLAG_VALUE(OverrideCsWalkOrder);
55+
enableHWGenerateLID = enableHWGenerateLIDInPass(walkOrder, is_pow2_x, is_pow2_y, is_pow2_z);
56+
}
57+
58+
if (IGC_IS_FLAG_ENABLED(OverrideCsTileLayoutEnable))
59+
{
60+
threadIDLayout = (ThreadIDLayout)IGC_IS_FLAG_ENABLED(OverrideCsTileLayout);
61+
}
62+
}
63+
64+
// Decides whether local IDs may be generated by HW for the given walk order.
// The answer is true only when both of the first two dimensions named by the
// walk order (e.g. X and Y for WO_XYZ / WO_YXZ) are flagged as power-of-two.
// A walk_order value that matches none of the six enumerators yields false.
bool IGC::enableHWGenerateLIDInPass(
    CS_WALK_ORDER walk_order,
    bool is_pow2_x, bool is_pow2_y, bool is_pow2_z)
{
    switch (walk_order)
    {
    // X and Y are walked first
    case CS_WALK_ORDER::WO_XYZ:
    case CS_WALK_ORDER::WO_YXZ:
        return is_pow2_x && is_pow2_y;

    // X and Z are walked first
    case CS_WALK_ORDER::WO_XZY:
    case CS_WALK_ORDER::WO_ZXY:
        return is_pow2_x && is_pow2_z;

    // Y and Z are walked first
    case CS_WALK_ORDER::WO_YZX:
    case CS_WALK_ORDER::WO_ZYX:
        return is_pow2_y && is_pow2_z;
    }

    return false;
}
89+
90+
// Picks a walk order whose two innermost dimensions are both power-of-two, so
// that HW can generate the local IDs. Returns None when no such pair exists.
//
// Layout:          for TileY the innermost dimension is pinned to Y (and Y is
//                  asserted to be power-of-two); any other layout follows the
//                  generic selection below
// is_pow2_x/y/z:   per-dimension power-of-two flags
Optional<CS_WALK_ORDER>
IGC::selectBestWalkOrderInPass(
    ThreadIDLayout Layout,
    bool is_pow2_x, bool is_pow2_y, bool is_pow2_z)
{
    // Dimension indices understood by getWalkOrderInPass.
    constexpr uint DIM_X = 0;
    constexpr uint DIM_Y = 1;
    constexpr uint DIM_Z = 2;

    if (Layout == ThreadIDLayout::TileY)
    {
        IGC_ASSERT(is_pow2_y);
        // Y is innermost; prefer X as the intermediate dimension, then Z.
        if (is_pow2_x)
        {
            return getWalkOrderInPass(DIM_Y, DIM_X);
        }
        if (is_pow2_z)
        {
            return getWalkOrderInPass(DIM_Y, DIM_Z);
        }
        return None;
    }

    //below is from HAS p-code except tileY
    //try to find walk_order so that HW can generate LID
    if (is_pow2_x)
    {
        // (pow2,pow2,z) or (pow2,y,pow2) or illegal
        if (is_pow2_y)
        {
            return getWalkOrderInPass(DIM_X, DIM_Y);
        }
        if (is_pow2_z)
        {
            return getWalkOrderInPass(DIM_X, DIM_Z);
        }
        return None;
    }

    // (x,pow2,pow2) or illegal
    if (is_pow2_y && is_pow2_z)
    {
        return getWalkOrderInPass(DIM_Y, DIM_Z);
    }

    return None;
}
130+
131+
// Widens emitMask so that it covers the local-ID channel named by channelNum.
// only 4 patterns are supported: None; X; XY; XYZ
// The mask is only ever widened: a request for X or Y leaves an already wider
// mask untouched, and a channel other than THREAD_ID_IN_GROUP_X/Y/Z is ignored.
void IGC::setEmitLocalMaskInPass(SGVUsage channelNum, EMIT_LOCAL_MASK& emitMask)
{
    if (channelNum == THREAD_ID_IN_GROUP_Z)
    {
        // Z needs the full mask regardless of the current state.
        emitMask = EMIT_LOCAL_MASK::EM_XYZ;
        return;
    }

    if (channelNum == THREAD_ID_IN_GROUP_Y)
    {
        const bool narrowerThanXY =
            (emitMask == EMIT_LOCAL_MASK::EM_NONE) || (emitMask == EMIT_LOCAL_MASK::EM_X);
        if (narrowerThanXY)
        {
            emitMask = EMIT_LOCAL_MASK::EM_XY;
        }
        return;
    }

    if (channelNum == THREAD_ID_IN_GROUP_X && emitMask == EMIT_LOCAL_MASK::EM_NONE)
    {
        emitMask = EMIT_LOCAL_MASK::EM_X;
    }
}
150+
151+
//order0: the internal walk dim
152+
//order1: the intermediate walk dim
153+
//e.g.: 1, 0 means, YXZ walkorder
154+
CS_WALK_ORDER IGC::getWalkOrderInPass(uint order0, uint order1)
155+
{
156+
auto getWalkOrderValue = [](uint order0, uint order1) constexpr {
157+
return (order0 << 4 | order1 << 2);
158+
};
159+
160+
switch (getWalkOrderValue(order0, order1))
161+
{
162+
case getWalkOrderValue(0, 1): return CS_WALK_ORDER::WO_XYZ; //012
163+
case getWalkOrderValue(0, 2): return CS_WALK_ORDER::WO_XZY; //021
164+
case getWalkOrderValue(1, 0): return CS_WALK_ORDER::WO_YXZ; //102
165+
case getWalkOrderValue(1, 2): return CS_WALK_ORDER::WO_YZX; //120
166+
case getWalkOrderValue(2, 0): return CS_WALK_ORDER::WO_ZXY; //201
167+
case getWalkOrderValue(2, 1): return CS_WALK_ORDER::WO_ZYX; //210
168+
default:
169+
IGC_ASSERT_MESSAGE(0, "unhandled case!");
170+
return CS_WALK_ORDER::WO_XYZ;
171+
}
172+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2022-2024 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
#ifndef __CS_WALK_ORDER_H__
10+
#define __CS_WALK_ORDER_H__
11+
12+
#include <llvm/Pass.h>
13+
#include "Compiler/CISACodeGen/ComputeShaderBase.hpp"
14+
15+
using namespace IGC;
16+
17+
namespace IGC {
18+
19+
void overrideWalkOrderKeysInPass(
20+
bool is_pow2_x, bool is_pow2_y, bool is_pow2_z,
21+
SComputeShaderWalkOrder& walkOrderStruct,
22+
CodeGenContext* ctx);
23+
bool enableHWGenerateLIDInPass(
24+
CS_WALK_ORDER walk_order,
25+
bool is_pow2_x, bool is_pow2_y, bool is_pow2_z);
26+
llvm::Optional<CS_WALK_ORDER> selectBestWalkOrderInPass(
27+
ThreadIDLayout Layout,
28+
bool is_pow2_x, bool is_pow2_y, bool is_pow2_z);
29+
void setEmitLocalMaskInPass(SGVUsage channelNum, EMIT_LOCAL_MASK& emitMask);
30+
CS_WALK_ORDER getWalkOrderInPass(uint order0, uint order1);
31+
} // End namespace IGC
32+
33+
#endif // __CS_WALK_ORDER_H__

IGC/Compiler/CISACodeGen/ComputeShaderBase.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ namespace IGC
4040
const CodeGenContext* pCtx = GetContext();
4141
const ModuleMetaData* MMD = pCtx->getModuleMetaData();
4242

43+
if (IGC_IS_FLAG_ENABLED(EnableSelectCSWalkOrderPass) &&
44+
pCtx->platform.EnableCSWalkerPass())
45+
{
46+
return;
47+
}
48+
4349
if (MMD->csInfo.neededThreadIdLayout == ThreadIDLayout::QuadTile)
4450
{
4551
m_ThreadIDLayout = ThreadIDLayout::QuadTile;

IGC/Compiler/CISACodeGen/OpenCLKernelCodeGen.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ namespace IGC
4848
std::vector<const char*> m_VISAAsmToLink;
4949
// Functions that are forced to be direct calls.
5050
std::unordered_set<std::string> m_DirectCallFunctions;
51+
SComputeShaderWalkOrder m_walkOrderStruct;
5152

5253
OpenCLProgramContext(
5354
const COCLBTILayout& btiLayout,

IGC/Compiler/CodeGenPublic.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,6 +1293,16 @@ namespace IGC
12931293
{}
12941294
};
12951295

1296+
struct SComputeShaderWalkOrder
1297+
{
1298+
ThreadIDLayout m_threadIDLayout = ThreadIDLayout::X;
1299+
CS_WALK_ORDER m_walkOrder = CS_WALK_ORDER::WO_XYZ;
1300+
EMIT_LOCAL_MASK m_emitMask = EMIT_LOCAL_MASK::EM_NONE;
1301+
//true if HW generates localIDs and puts them to payload
1302+
//false if SW generates localIDs and prolog kernel loads them from memory
1303+
bool m_enableHWGenerateLID = false;
1304+
};
1305+
12961306
void OptimizeIR(CodeGenContext* ctx);
12971307

12981308
/**

IGC/Compiler/InitializePasses.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ void initializeHoistFMulInLoopPassPass(llvm::PassRegistry&);
4747
void initializeHandleFRemInstructionsPass(llvm::PassRegistry&);
4848
void initializeDeSSAPass(llvm::PassRegistry&);
4949
void initializeDetectCSWalkOrderPass(llvm::PassRegistry&);
50+
void initializeSelectCSWalkOrderPass(llvm::PassRegistry&);
5051
void initializeDeviceEnqueueFuncsAnalysisPass(llvm::PassRegistry&);
5152
void initializeDeviceEnqueueFuncsResolutionPass(llvm::PassRegistry&);
5253
void initializeDisableLoopUnrollOnRetryPass(llvm::PassRegistry&);
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
; REQUIRES: regkeys, llvm-14-plus
9+
;
10+
; RUN: igc_opt -igc-DetectCSWalkOrder -inputcs -regkey EnableDetectCSWalkOrder -S < %s 2>&1 | FileCheck %s
11+
; ------------------------------------------------
12+
; DetectCSWalkOrder
13+
; ------------------------------------------------
14+
15+
; Kernel used as input for the DetectCSWalkOrder run line of this test.
; It derives %f1..%f5 from the flattened in-group thread id and then builds
; three independent shl/add chains from the LocalID_X/Y/Z system values
; (SystemValue ids 17, 18 and 19).
; NOTE(review): every FileCheck directive below also matches the unmodified
; input in the same order, so the test appears to verify only that these
; instructions are still present after the pass - confirm that is intended.
define void @main(<8 x i32> %r0, i8* %privateBase) {
16+
; CHECK: %f5 = lshr i16 %f4, 1
17+
; CHECK: %LocalID_Y = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 18)
18+
; CHECK: %LocalID_Z = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 19)
19+
; CHECK: %LocalID_X = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 17)
20+
; CHECK: [[TMP1:%.*]] = shl i32 %LocalID_Y, 4
21+
; CHECK: [[TMP2:%.*]] = shl i32 %LocalID_Z, 8
22+
; CHECK: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]]
23+
; CHECK: [[TMP4:%.*]] = add i32 %LocalID_X, [[TMP3]]
24+
; CHECK: [[TMP5:%.*]] = add i32 %LocalID_Y, %LocalID_Z
25+
; CHECK: [[TMP6:%.*]] = shl i32 %LocalID_Y, 3
26+
; CHECK: [[TMP7:%.*]] = add i32 [[TMP6]], %LocalID_X
27+
; CHECK: [[TMP8:%.*]] = lshr i32 [[TMP7]], 4
28+
29+
; Values derived from the flattened thread id; only %f5 has a directive above.
%flat = call i32 @dx.op.flattenedThreadIdInGroup.i32(i32 96)
30+
%f1 = urem i32 %flat, 18
31+
%f2 = udiv i32 %flat, 18
32+
%f3 = lshr i32 %flat, 4
33+
%f4 = trunc i32 %flat to i16
34+
%f5 = lshr i16 %f4, 1
35+
36+
; Chain 1: X + ((Y << 4) + (Z << 8))
%LocalID_Y = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 18)
37+
%LocalID_Z = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 19)
38+
%LocalID_X = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 17)
39+
%1 = shl i32 %LocalID_Y, 4
40+
%2 = shl i32 %LocalID_Z, 8
41+
%3 = add i32 %1, %2
42+
%4 = add i32 %LocalID_X, %3
43+
44+
; Chain 2: X + ((Y + Z) << 8); the directives above cover only the first add.
%5 = add i32 %LocalID_Y, %LocalID_Z
45+
%6 = shl i32 %5, 8
46+
%7 = add i32 %LocalID_X, %6
47+
48+
; Chain 3: ((Y << 3) + X) >> 4
%8 = shl i32 %LocalID_Y, 3
49+
%9 = add i32 %8, %LocalID_X
50+
%10 = lshr i32 %9, 4
51+
52+
ret void
53+
}
54+
55+
declare i32 @dx.op.flattenedThreadIdInGroup.i32(i32) #0
56+
declare i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32) #0
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
; REQUIRES: regkeys, llvm-14-plus
9+
;
10+
; RUN: igc_opt -igc-SelectCSWalkOrder --inputcs -regkey EnableSelectCSWalkOrderPass -S < %s 2>&1 | FileCheck %s
11+
; ------------------------------------------------
12+
; SelectCSWalkOrder
13+
; ------------------------------------------------
14+
15+
%__2D_DIM_Resource = type opaque
16+
%dx.types.Handle = type { i8* }
17+
18+
; Kernel used as input for the SelectCSWalkOrder run line of this test.
; The entry block reads GroupID_X/Y (SystemValue ids 14 and 15) and LocalID_X
; (id 17), a single-block loop runs until its counter reaches 10, and the exit
; block reads LocalID_Y (id 18) and issues one typedwrite addressed by
; %ThreadID_X / %ThreadID_Y.
; NOTE(review): the FileCheck directives below do not mention %LocalID_X or
; %ThreadID_X, and each of them also matches the unmodified input in order -
; confirm the test is meant to check only that these instructions remain.
define void @main(<8 x i32> %r0) {
19+
; CHECK: [[TMP1:%.*]] = call fast float @llvm.genx.GenISA.DCL.SystemValue.f32(i32 14)
20+
; CHECK: %GroupID_X = bitcast float [[TMP1]] to i32
21+
; CHECK: [[TMP2:%.*]] = call fast float @llvm.genx.GenISA.DCL.SystemValue.f32(i32 15)
22+
; CHECK: %GroupID_Y = bitcast float [[TMP2]] to i32
23+
; CHECK: [[TMP3:%.*]] = shl i32 %GroupID_X, 4
24+
; CHECK: [[TMP4:%.*]] = shl i32 %GroupID_Y, 4
25+
; CHECK: %LocalID_Y = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 18)
26+
; CHECK: %ThreadID_Y = add i32 [[TMP4]], %LocalID_Y
27+
28+
; Entry block: group ids, the resource pointer %u0 and the X thread id.
%1 = call fast float @llvm.genx.GenISA.DCL.SystemValue.f32(i32 14)
29+
%GroupID_X = bitcast float %1 to i32
30+
%2 = call fast float @llvm.genx.GenISA.DCL.SystemValue.f32(i32 15)
31+
%GroupID_Y = bitcast float %2 to i32
32+
%3 = call i32 @llvm.genx.GenISA.RuntimeValue.i32(i32 1)
33+
%u0 = inttoptr i32 %3 to %__2D_DIM_Resource addrspace(2490368)*
34+
; ThreadID_X = (GroupID_X << 4) + LocalID_X
%4 = shl i32 %GroupID_X, 4
35+
%LocalID_X = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 17)
36+
%ThreadID_X = add i32 %4, %LocalID_X
37+
br label %5
38+
39+
; Loop block: load / atomic / store on addrspace(3); %6 counts from 0 and the
; loop exits once the incremented counter equals 10.
5: ; preds = %5, %0
40+
%6 = phi i32 [ 0, %0 ], [ %9, %5 ]
41+
%7 = load i32, i32 addrspace(3)* null, align 2147483648, !tbaa !390
42+
%8 = call i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(i32 addrspace(3)* nonnull inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* nonnull inttoptr (i32 4 to i32 addrspace(3)*), i32 %7, i32 0)
43+
store i32 %8, i32 addrspace(3)* inttoptr (i32 8 to i32 addrspace(3)*), align 8, !tbaa !390
44+
%9 = add nuw nsw i32 %6, 1
45+
%10 = icmp eq i32 %9, 10
46+
br i1 %10, label %11, label %5
47+
48+
; Exit block: ThreadID_Y = (GroupID_Y << 4) + LocalID_Y, then the typed write.
11: ; preds = %5
49+
%12 = shl i32 %GroupID_Y, 4
50+
%LocalID_Y = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 18)
51+
%ThreadID_Y = add i32 %12, %LocalID_Y
52+
%13 = load i32, i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*), align 16, !tbaa !390
53+
; The same loaded value is bitcast four times to supply the four float operands.
%14 = bitcast i32 %13 to float
54+
%15 = bitcast i32 %13 to float
55+
%16 = bitcast i32 %13 to float
56+
%17 = bitcast i32 %13 to float
57+
call void @llvm.genx.GenISA.typedwrite.p2490368__2D_DIM_Resource(%__2D_DIM_Resource addrspace(2490368)* %u0, i32 %ThreadID_X, i32 %ThreadID_Y, i32 0, i32 0, float %14, float %15, float %16, float %17)
58+
ret void
59+
}
60+
61+
declare i32 @llvm.genx.GenISA.RuntimeValue.i32(i32) #0
62+
declare i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(i32 addrspace(3)*, i32 addrspace(3)*, i32, i32) #3
63+
declare void @llvm.genx.GenISA.typedwrite.p2490368__2D_DIM_Resource(%__2D_DIM_Resource addrspace(2490368)*, i32, i32, i32, i32, float, float, float, float) #4
64+
declare float @llvm.genx.GenISA.DCL.SystemValue.f32(i32) #0
65+
declare i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32) #0
66+
67+
!390 = !{!391, !391, i64 0}
68+
!391 = !{!"int", !392, i64 0}
69+
!392 = !{!"omnipotent char", !393, i64 0}
70+
!393 = !{!"Simple C/C++ TBAA"}

0 commit comments

Comments
 (0)