Skip to content

Commit 70a136e

Browse files
pkwasnie-intel authored and igcbot committed
optimizations to global_id_offset/local_size implicit args
Compute workloads add the following implicit arguments:
1. payloadHeader - 8 x i32, packing global_id_offset (3 x i32), local_size (3 x i32) and 2 x i32 reserved.
2. enqueued_local_size - 3 x i32.
local_size is never used; enqueued_local_size is used instead. As a result, payloadHeader carries 20 unused bytes.
This change introduces the following optimizations:
1. Reduces payloadHeader to 3 x i32, packing only global_id_offset. Controlled with ShortImplicitPayloadHeader, disabled by default.
2. Removes global_id_offset and enqueued_local_size from the finalizer and zeinfo if the arguments are unused. Controlled with RemoveUnusedIdImplicitArguments, disabled by default.
1 parent c9d0fec commit 70a136e

File tree

11 files changed

+76
-5
lines changed

11 files changed

+76
-5
lines changed

IGC/AdaptorCommon/ImplicitArgs.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ static const std::vector<ImplicitArg> IMPLICIT_ARGS = {
104104

105105
// BufferBoundsChecking
106106
ImplicitArg(ImplicitArg::BUFFER_SIZE, "bufferSize", ImplicitArg::LONG, WIAnalysis::UNIFORM_GLOBAL, 1, ImplicitArg::ALIGN_QWORD, true),
107+
108+
ImplicitArg(ImplicitArg::PAYLOAD_HEADER_SHORT, "payloadHeader", ImplicitArg::INT, WIAnalysis::UNIFORM_WORKGROUP, 3, ImplicitArg::ALIGN_DWORD, true, GenISAIntrinsic::GenISA_getPayloadHeader),
107109
};
108110

109111
ImplicitArg::ImplicitArg(

IGC/AdaptorCommon/ImplicitArgs.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ namespace IGC
130130
// BufferBoundsChecking
131131
BUFFER_SIZE,
132132

133+
PAYLOAD_HEADER_SHORT,
134+
133135
NUM_IMPLICIT_ARGS
134136
};
135137

IGC/Compiler/CISACodeGen/CShader.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2841,6 +2841,7 @@ CVariable* CShader::getOrCreateArgumentSymbol(
28412841
// optimization, with some advanced analysis.
28422842
if (ArgType == ImplicitArg::ArgType::R0 ||
28432843
ArgType == ImplicitArg::ArgType::PAYLOAD_HEADER ||
2844+
ArgType == ImplicitArg::ArgType::PAYLOAD_HEADER_SHORT ||
28442845
ArgType == ImplicitArg::ArgType::WORK_DIM ||
28452846
ArgType == ImplicitArg::ArgType::NUM_GROUPS ||
28462847
ArgType == ImplicitArg::ArgType::GLOBAL_SIZE ||

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22818,6 +22818,7 @@ void EmitPass::emitImplicitArgIntrinsic(llvm::GenIntrinsicInst* I)
2281822818

2281922819
if (IAtype == ImplicitArg::ArgType::R0 ||
2282022820
IAtype == ImplicitArg::ArgType::PAYLOAD_HEADER ||
22821+
IAtype == ImplicitArg::ArgType::PAYLOAD_HEADER_SHORT ||
2282122822
IAtype == ImplicitArg::ArgType::WORK_DIM ||
2282222823
IAtype == ImplicitArg::ArgType::NUM_GROUPS ||
2282322824
IAtype == ImplicitArg::ArgType::GLOBAL_SIZE ||

IGC/Compiler/CISACodeGen/OpenCLKernelCodeGen.cpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,14 @@ namespace IGC
919919
zebin::PreDefinedAttrGetter::ArgType::local_size, cur_pos, size);
920920
break;
921921
}
922+
case KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER_SHORT: {
923+
// PayloadHeader contains global work offset x,y,z
924+
// global work offset size is int32x3
925+
uint32_t size = iOpenCL::DATA_PARAMETER_DATA_SIZE * 3;
926+
zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
927+
zebin::PreDefinedAttrGetter::ArgType::global_id_offset, payloadPosition, size);
928+
break;
929+
}
922930
case KernelArg::ArgType::IMPLICIT_PRIVATE_BASE:
923931
zebin::ZEInfoBuilder::addPayloadArgumentImplicit(m_kernelInfo.m_zePayloadArgs,
924932
zebin::PreDefinedAttrGetter::ArgType::private_base_stateless,
@@ -1512,9 +1520,11 @@ namespace IGC
15121520
break;
15131521

15141522
case KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER:
1523+
case KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER_SHORT:
15151524
// PayloadHeader contains global work offset x,y,z and local size x,y,z -->
15161525
// total of 6 annotations, 3 of each type
1517-
for (int i = 0; i < 6; ++i)
1526+
// Short PayloadHeader reduces it to only global work offset
1527+
for (int i = 0; i < (type == KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER ? 6 : 3); ++i)
15181528
{
15191529
auto constInput = std::make_unique<iOpenCL::ConstantInputAnnotation>();
15201530

@@ -2444,6 +2454,15 @@ namespace IGC
24442454
arg.getArgType() == KernelArg::ArgType::IMPLICIT_BUFFER_SIZE) &&
24452455
arg.getArg()->use_empty();
24462456

2457+
if (IGC_IS_FLAG_ENABLED(RemoveUnusedIdImplicitArguments))
2458+
{
2459+
IsUnusedArg |=
2460+
(arg.getArgType() == KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER || // contains global_id_offset
2461+
arg.getArgType() == KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER_SHORT ||
2462+
arg.getArgType() == KernelArg::ArgType::IMPLICIT_ENQUEUED_LOCAL_WORK_SIZE) &&
2463+
arg.getArg()->use_empty();
2464+
}
2465+
24472466
// Runtime Values should not be processed any further. No annotations shall be created for them.
24482467
// Only added to KernelArgs to enforce correct allocation order.
24492468
bool isRuntimeValue = (arg.getArgType() == KernelArg::ArgType::RUNTIME_VALUE);

IGC/Compiler/Optimizer/OpenCLPasses/KernelArgs/KernelArgs.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,8 @@ KernelArg::ArgType KernelArg::calcArgType(const ImplicitArg& arg) const
253253
return KernelArg::ArgType::IMPLICIT_R0;
254254
case ImplicitArg::PAYLOAD_HEADER:
255255
return KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER;
256+
case ImplicitArg::PAYLOAD_HEADER_SHORT:
257+
return KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER_SHORT;
256258
case ImplicitArg::PRIVATE_BASE:
257259
return KernelArg::ArgType::IMPLICIT_PRIVATE_BASE;
258260
case ImplicitArg::CONSTANT_BASE:
@@ -807,6 +809,7 @@ KernelArgsOrder::KernelArgsOrder(InputType layout)
807809

808810
KernelArg::ArgType::RUNTIME_VALUE,
809811
KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER,
812+
KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER_SHORT,
810813

811814
KernelArg::ArgType::PTR_LOCAL,
812815
KernelArg::ArgType::PTR_GLOBAL,
@@ -933,6 +936,7 @@ KernelArgsOrder::KernelArgsOrder(InputType layout)
933936

934937
KernelArg::ArgType::RUNTIME_VALUE,
935938
KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER,
939+
KernelArg::ArgType::IMPLICIT_PAYLOAD_HEADER_SHORT,
936940
KernelArg::ArgType::PTR_LOCAL,
937941
KernelArg::ArgType::PTR_GLOBAL,
938942
KernelArg::ArgType::PTR_CONSTANT,

IGC/Compiler/Optimizer/OpenCLPasses/KernelArgs/KernelArgs.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ namespace IGC
4444
R1,
4545

4646
IMPLICIT_PAYLOAD_HEADER, // known as INPUT_HEADER in USC
47+
IMPLICIT_PAYLOAD_HEADER_SHORT, // payload header reduced to 3xi32
4748

4849
PTR_LOCAL,
4950
PTR_GLOBAL,

IGC/Compiler/Optimizer/OpenCLPasses/WIFuncs/WIFuncResolution.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ void WIFuncResolution::visitCallInst(CallInst& CI)
208208
209209
210210
PayloadHeader:
211+
(Note: PayloadHeader uses 8xi32, but only 3xi32 are used. Unused bytes can be removed.)
211212
212213
-----------------------------------------------------------------------------------------------
213214
| Global | Global | Global | Local | Local | Local | Reserved | Num |
@@ -511,8 +512,9 @@ Value* WIFuncResolution::getGlobalOffset(CallInst& CI)
511512
// Creates:
512513
// %globalOffset = extractelement <8 x i32> %payloadHeader, i32 %dim
513514

515+
auto Ty = IGC_IS_FLAG_ENABLED(ShortImplicitPayloadHeader) ? ImplicitArg::PAYLOAD_HEADER_SHORT : ImplicitArg::PAYLOAD_HEADER;
514516
auto F = CI.getFunction();
515-
Value* V = m_implicitArgs.getImplicitArgValue(*F, ImplicitArg::PAYLOAD_HEADER, m_pMdUtils);
517+
Value* V = m_implicitArgs.getImplicitArgValue(*F, Ty, m_pMdUtils);
516518
IGC_ASSERT(V != nullptr);
517519

518520
Value* dim = CI.getArgOperand(0);

IGC/Compiler/Optimizer/OpenCLPasses/WIFuncs/WIFuncsAnalysis.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,15 @@ bool WIFuncsAnalysis::runOnFunction(Function& F)
9797
SmallVector<ImplicitArg::ArgType, ImplicitArg::NUM_IMPLICIT_ARGS> implicitArgs;
9898

9999
const bool RequirePayloadHeader = m_ctx->m_DriverInfo.RequirePayloadHeader();
100+
const auto PayloadHeaderType = IGC_IS_FLAG_ENABLED(ShortImplicitPayloadHeader) ? ImplicitArg::PAYLOAD_HEADER_SHORT : ImplicitArg::PAYLOAD_HEADER;
100101

101102
// All OpenCL kernels receive R0 and Payload Header implicitly
102103
if (isEntryFunc(m_pMDUtils, &F))
103104
{
104105
implicitArgs.push_back(ImplicitArg::R0);
105106

106107
if (RequirePayloadHeader)
107-
implicitArgs.push_back(ImplicitArg::PAYLOAD_HEADER);
108+
implicitArgs.push_back(PayloadHeaderType);
108109

109110
if (!m_ctx->platform.isProductChildOf(IGFX_XE_HP_SDV) &&
110111
IGC_IS_FLAG_ENABLED(EnableGlobalStateBuffer))
@@ -141,12 +142,12 @@ bool WIFuncsAnalysis::runOnFunction(Function& F)
141142
}
142143
if (m_hasGlobalOffset && RequirePayloadHeader)
143144
{
144-
implicitArgs.push_back(ImplicitArg::PAYLOAD_HEADER);
145+
implicitArgs.push_back(PayloadHeaderType);
145146
}
146147
}
147148
if (m_hasGlobalOffset && !RequirePayloadHeader)
148149
{
149-
implicitArgs.push_back(ImplicitArg::PAYLOAD_HEADER);
150+
implicitArgs.push_back(PayloadHeaderType);
150151
}
151152
if (m_hasWorkDim)
152153
{
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: llvm-14-plus, regkeys
10+
11+
; RUN: igc_opt --opaque-pointers -igc-wi-func-analysis -regkey ShortImplicitPayloadHeader=0 -S %s | FileCheck %s --check-prefixes=CHECK,CHECK-LONG-PAYLOAD
12+
; RUN: igc_opt --opaque-pointers -igc-wi-func-analysis -regkey ShortImplicitPayloadHeader=1 -S %s | FileCheck %s --check-prefixes=CHECK,CHECK-SHORT-PAYLOAD
13+
14+
; Test switching between long (original) and short implicit payload header.
15+
16+
declare i32 @__builtin_IB_get_local_id_x()
17+
18+
define i32 @foo(i32 %dim) nounwind {
19+
%id = call i32 @__builtin_IB_get_local_id_x()
20+
ret i32 %id
21+
}
22+
23+
!igc.functions = !{!0}
24+
!0 = !{i32 (i32)* @foo, !1}
25+
!1 = !{!2, !3}
26+
!2 = !{!"function_type", i32 0}
27+
!3 = !{!"implicit_arg_desc"}
28+
29+
;CHECK: !{!"implicit_arg_desc", ![[A1:[0-9]+]], ![[A2:[0-9]+]], ![[A4:[0-9]+]], ![[A5:[0-9]+]], ![[A6:[0-9]+]]}
30+
;CHECK: ![[A1]] = !{i32 0}
31+
;CHECK-LONG-PAYLOAD: ![[A2]] = !{i32 1}
32+
;CHECK-SHORT-PAYLOAD: ![[A2]] = !{i32 68}
33+
;CHECK: ![[A4]] = !{i32 7}
34+
;CHECK: ![[A5]] = !{i32 8}
35+
;CHECK: ![[A6]] = !{i32 9}

0 commit comments

Comments
 (0)