Skip to content

Commit c93c673

Browse files
matborzyszkowski authored and igcbot committed
Add PTL support
Add PTL support
1 parent bea8acd commit c93c673

File tree

112 files changed

+3018
-58
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

112 files changed

+3018
-58
lines changed

IGC/AdaptorCommon/API/igc.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,5 +206,20 @@ typedef enum
206206
FCEXP_DISABLED = FCEXP_TOBE_DESIGNED
207207
} FCEXP_FLAG_t;
208208

209+
//////////////////////////////////////////////////////////////////////////
210+
/// @brief Structure for passing precompiled LLVM bytecode to IGC.
211+
namespace IGC
212+
{
213+
struct BIFModule
214+
{
215+
uint64_t m_ByteCodeSize = 0;
216+
const void* m_pLLVMBytecode = nullptr;
217+
218+
// These bits are opaque to the IGC.
219+
// They can be used to provide configuration data for
220+
// the function(s) in the LLVM from the bytecode.
221+
uint64_t m_ConfigBits = 0;
222+
};
223+
}
209224
#endif // __IGC_H
210225

IGC/AdaptorCommon/RayTracing/API/BVHInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,11 @@ namespace IGC
2121
bool hasFixedOffset = false;
2222
size_t offset = 0;
2323

24+
bool uses64Bit = false;
2425
inline bool operator==(const BVHInfo& RHS) const
2526
{
2627
return (
28+
uses64Bit == RHS.uses64Bit &&
2729
hasFixedOffset == RHS.hasFixedOffset &&
2830
offset == RHS.offset
2931
);

IGC/AdaptorCommon/RayTracing/API/RayDispatchGlobalData.h

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,11 +232,65 @@ struct RayDispatchGlobalData
232232
uint32_t bvhLevels : 3;
233233
uint32_t MBZ3 : 29;
234234
} rt_data_info;
235+
// In addition to the dword of padding to align `common`, we also
236+
// add 6 dwords (see `paddingBits[1+6]`) so Xe and Xe3 both have the same RTGlobals size.
235237
uint32_t paddingBits[1+6]; // padding
236238

237239
// HW doesn't read anything below this point.
238240
RayDispatchGlobalDataCommon common;
239241
} xe;
242+
struct Xe3
243+
{
244+
template<class TAPIAdaptor>
245+
void populate(const TAPIAdaptor& umd)
246+
{
247+
rtMemBasePtr = umd.GetRayStackBufferAddress();
248+
callStackHandlerPtr = umd.GetCallStackHandlerPtr();
249+
stack_size_info.stackSizePerRay = umd.GetStackSizePerRay();
250+
num_stacks_info.numRTStacks = umd.GetNumDSSRTStacks();
251+
252+
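// Bits 31:16 of this dword are MBZ (must be zero); keep only the low 16 bits.
// NOTE(review): `numDSSRTStacks` and `_pad1_mbz` are direct union members, so on the usual ABIs both alias bits 15:0; the mask below is what actually clears the upper half.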
253+
num_stacks_info.numRTStacks = (num_stacks_info.numRTStacks & 0x0000FFFF);
254+
255+
constexpr uint32_t strideMask = (1 << 13) - 1;
256+
const uint32_t hgs = umd.GetHitGroupStride() & strideMask;
257+
const uint32_t mss = umd.GetMissStride() & strideMask;
258+
rt_data_info.packedData = (umd.GetMaxBVHLevels() << 0) | (hgs << 3) | (mss << 16);
259+
260+
hitGroupBasePtr = umd.GetHitGroupTable();
261+
missShaderBasePtr = umd.GetMissShaderTable();
262+
263+
common.populate(umd);
264+
}
265+
266+
uint64_t rtMemBasePtr; // base address of the allocated stack memory
267+
uint64_t callStackHandlerPtr; // this is the KSP of the continuation handler that is invoked by BTD when the read KSP is 0
268+
union {
269+
uint32_t stackSizePerRay; // async-RT stack size in 64 byte blocks
270+
uint32_t _pad0_mbz : 32;
271+
} stack_size_info;
272+
union {
273+
uint32_t numRTStacks; // number of stacks per DSS
274+
uint32_t numDSSRTStacks : 16; // number of asynch stacks per DSS
275+
uint32_t _pad1_mbz : 16;
276+
277+
} num_stacks_info;
278+
union {
279+
uint32_t packedData;
280+
uint32_t maxBVHLevels : 3; // the maximal number of supported instancing levels (0->8, 1->1, 2->2, ...)
281+
uint32_t hitGroupStride : 13; // stride of hit group shader records (16-bytes alignment)
282+
uint32_t missShaderStride : 13; // stride of miss shader records (8-bytes alignment)
283+
uint32_t _pad2_mbz : 3;
284+
} rt_data_info;
285+
uint32_t flags : 1; // per context control flags
286+
uint32_t pad_mbz : 31;
287+
uint64_t hitGroupBasePtr; // base pointer of hit group shader record array (16-bytes alignment)
288+
uint64_t missShaderBasePtr; // base pointer of miss shader record array (8-bytes alignment)
289+
uint32_t _align_mbz[2]; // pad hardware section so `common` sits at the same offset as in Xe (fields above total 56 bytes, not 64 — TODO confirm)
290+
291+
// HW doesn't read anything below this point.
292+
RayDispatchGlobalDataCommon common;
293+
} xe3;
240294
} rt;
241295
};
242296

@@ -251,6 +305,8 @@ static_assert(RTStackAlign % RayDispatchGlobalData::StackChunkSize == 0, "no?");
251305

252306
static_assert(sizeof(RayDispatchGlobalData) == 184, "unexpected size?");
253307
static_assert(sizeof(RayDispatchGlobalData::RT::Xe) == sizeof(RayDispatchGlobalData), "unexpected size?");
308+
static_assert(sizeof(RayDispatchGlobalData::RT::Xe3) == sizeof(RayDispatchGlobalData), "unexpected size?");
309+
static_assert(offsetof(RayDispatchGlobalData::RT::Xe, common) == offsetof(RayDispatchGlobalData::RT::Xe3, common), "unexpected size?");
254310
#ifdef HAS_INCLUDE_TYPE_TRAITS
255311
static_assert(std::is_standard_layout<RayDispatchGlobalData>::value, "no?");
256312
#endif // HAS_INCLUDE_TYPE_TRAITS

IGC/AdaptorCommon/RayTracing/AutoGenRTStackAccessPrivateOS.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -900,6 +900,16 @@ auto* _get_numDSSRTStacks_Xe(const Twine& _ReturnName = "")
900900
return V_2;
901901
}
902902

903+
auto* _get_maxBVHLevels_Xe3(const Twine& _ReturnName = "")
904+
{
905+
auto* V_0 = getGlobalBufferPtr();
906+
auto* V_1 = CreateInBoundsGEP(_struct_IGC__RayDispatchGlobalData(*Ctx.getModule()), V_0, { getInt64(0), getInt32(0), getInt32(0), getInt32(4), getInt32(0) });
907+
auto* V_2 = CreateLoad(getInt32Ty(), V_1);
908+
setInvariantLoad(V_2);
909+
auto* V_3 = CreateAnd(V_2, getInt32(7), _ReturnName);
910+
return V_3;
911+
}
912+
903913
auto* _get_statelessScratchPtr(const Twine& _ReturnName = "")
904914
{
905915
auto* V_0 = getGlobalBufferPtr();

IGC/AdaptorCommon/RayTracing/RTBuilder.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ SPDX-License-Identifier: MIT
3636

3737

3838

39-
4039
using namespace llvm;
4140
using namespace RTStackFormat;
4241
using namespace IGC;
@@ -91,6 +90,15 @@ void RTBuilder::setInvariantLoad(LoadInst* LI)
9190

9291
Value* RTBuilder::getRtMemBasePtr(void)
9392
{
93+
#define STYLE(X) { \
94+
using T = std::conditional_t< \
95+
std::is_same_v<RTStackFormat::X, RTStackFormat::Xe>, \
96+
RayDispatchGlobalData::RT::Xe, RayDispatchGlobalData::RT::Xe3>; \
97+
static_assert( \
98+
offsetof(RayDispatchGlobalData::RT::Xe, rtMemBasePtr) == \
99+
offsetof(T, rtMemBasePtr)); }
100+
#include "RayTracingMemoryStyle.h"
101+
#undef STYLE
94102
return _get_rtMemBasePtr_Xe(VALUE_NAME("rtMemBasePtr"));
95103
}
96104

@@ -1170,6 +1178,10 @@ std::pair<uint32_t, uint32_t> RTBuilder::getSliceIDBitsInSR0() const {
11701178
{
11711179
return {11, 15};
11721180
}
1181+
else if (Ctx.platform.GetPlatformFamily() == IGFX_XE3_CORE)
1182+
{
1183+
return {14, 17};
1184+
}
11731185
else
11741186
{
11751187
return {12, 14};
@@ -1186,6 +1198,10 @@ std::pair<uint32_t, uint32_t> RTBuilder::getSubsliceIDBitsInSR0() const {
11861198
{
11871199
return {8, 9};
11881200
}
1201+
else if (Ctx.platform.GetPlatformFamily() == IGFX_XE3_CORE)
1202+
{
1203+
return {8, 11};
1204+
}
11891205
else
11901206
{
11911207
return {8, 8};
@@ -1236,6 +1252,13 @@ Value* RTBuilder::getGlobalDSSID()
12361252
{
12371253
return emitStateRegID(dssIDBits.first, sliceIDBits.second);
12381254
}
1255+
else if (isChildOfXe3)
1256+
{
1257+
Value* sliceID = emitStateRegID(sliceIDBits.first, sliceIDBits.second);
1258+
Value* dssID = emitStateRegID(dssIDBits.first, dssIDBits.second);
1259+
Value* globalDSSID = CreateMul(sliceID, getInt32(NumDSSPerSlice));
1260+
return CreateAdd(globalDSSID, dssID);
1261+
}
12391262
else
12401263
{
12411264
Value* dssID = emitStateRegID(dssIDBits.first, dssIDBits.second);

IGC/AdaptorCommon/RayTracing/RTBuilder.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ class RTBuilder : public IGCIRBuilder<>
5959

6060

6161
bool isChildOfXe2 = false;
62+
bool isChildOfXe3 = false;
6263

6364
// Field for explicit GlobalBufferPtr - used on OpenCL path.
6465
Value* GlobalBufferPtr = nullptr;
@@ -75,8 +76,36 @@ class RTBuilder : public IGCIRBuilder<>
7576
enabledSlices++;
7677
}
7778
}
79+
isChildOfXe3 = Ctx.platform.isCoreChildOf(IGFX_XE3_CORE);
7880
isChildOfXe2 = Ctx.platform.isCoreChildOf(IGFX_XE2_HPG_CORE);
7981

82+
if (isChildOfXe3)
83+
{
84+
EuCountPerDSS = SysInfo.MaxEuPerSubSlice;
85+
MaxDualSubSlicesSupported = 0;
86+
87+
IGC_ASSERT(NumDSSPerSlice <= GT_MAX_SUBSLICE_PER_SLICE);
88+
89+
for (unsigned int sliceID = 0; sliceID < GT_MAX_SLICE; ++sliceID)
90+
{
91+
if (SysInfo.SliceInfo[sliceID].Enabled)
92+
{
93+
NumDSSPerSlice = SysInfo.SliceInfo[sliceID].SubSliceEnabledCount;
94+
95+
// SubSliceInfo size is GT_MAX_SUBSLICE_PER_SLICE, but
96+
// the actual number of SubSlices, calculated for the given platform, is used
97+
// to iterate only through SubSlices present on the platform.
98+
for (unsigned int ssID = 0; ssID < NumDSSPerSlice; ++ssID)
99+
{
100+
if (SysInfo.SliceInfo[sliceID].SubSliceInfo[ssID].Enabled)
101+
{
102+
MaxDualSubSlicesSupported = std::max(MaxDualSubSlicesSupported, (sliceID * NumDSSPerSlice) + ssID + 1);
103+
}
104+
}
105+
}
106+
}
107+
}
108+
else // chains into the Xe2 `if` below, forming an `else if`
80109
if (isChildOfXe2 || Ctx.platform.isProductChildOf(IGFX_PVC))
81110
{
82111
NumDSSPerSlice = SysInfo.MaxSubSlicesSupported / std::max(SysInfo.MaxSlicesSupported, enabledSlices);
@@ -254,6 +283,8 @@ class RTBuilder : public IGCIRBuilder<>
254283
Value* getStatelessScratchPtr(void);
255284
Value* getLeafType(StackPointerVal* StackPointer, bool CommittedHit);
256285
Value* getIsFrontFace(StackPointerVal* StackPointer, IGC::CallableShaderTypeMD ShaderTy);
286+
// Xe3: memhit->leafNodeSubType
287+
Value* getLeafNodeSubType(StackPointerVal* StackPointer, bool CommittedHit);
257288

258289
Value* CreateSyncStackPtrIntrinsic(Value* Addr, Type* PtrTy, bool AddDecoration);
259290

IGC/BiFModule/Headers/bif_flag_controls.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,5 @@ BIF_FLAG_CONTROL(bool, UseHighAccuracyMath)
3636
BIF_FLAG_CONTROL(bool, EnableSWSrgbWrites)
3737
BIF_FLAG_CONTROL(int, MaxHWThreadIDPerSubDevice)
3838
BIF_FLAG_CONTROL(int, JointMatrixLoadStoreOpt)
39+
BIF_FLAG_CONTROL(bool, UseOOBChecks)
3940
#endif // __BIF_FLAG_CONTROL_H__

IGC/BiFModule/Implementation/group.cl

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2229,6 +2229,10 @@ short SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformBroadcast, _i32_i16_i32, )
22292229
{
22302230
if (Execution == Subgroup)
22312231
{
2232+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
2233+
{
2234+
Id = Id & (get_max_sub_group_size() - 1);
2235+
}
22322236
return as_ushort(__builtin_IB_simd_shuffle_h(as_half(Value), Id));
22332237
}
22342238
else
@@ -2292,6 +2296,10 @@ double SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformBroadcast, _i32_f64_i32,
22922296
{
22932297
if (Execution == Subgroup)
22942298
{
2299+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
2300+
{
2301+
Id = Id & (get_max_sub_group_size() - 1);
2302+
}
22952303
return __builtin_IB_simd_shuffle_df( Value, Id );
22962304
}
22972305
else
@@ -3221,6 +3229,10 @@ double SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffle, _i32_f64_i32, )(
32213229
{
32223230
if (Execution == Subgroup)
32233231
{
3232+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
3233+
{
3234+
c = c & (get_max_sub_group_size() - 1);
3235+
}
32243236
return __builtin_IB_simd_shuffle_df(x, c);
32253237
}
32263238
return 0;
@@ -3232,6 +3244,10 @@ half SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffle, _i32_f16_i32, )(in
32323244
{
32333245
if (Execution == Subgroup)
32343246
{
3247+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
3248+
{
3249+
c = c & (get_max_sub_group_size() - 1);
3250+
}
32353251
return __builtin_IB_simd_shuffle_h(x, c);
32363252
}
32373253
return 0;
@@ -3264,6 +3280,10 @@ char SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffleDown, _i32_i8_i32, )
32643280
{
32653281
if (Execution == Subgroup)
32663282
{
3283+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
3284+
{
3285+
c = c & (get_max_sub_group_size() - 1);
3286+
}
32673287
return __builtin_IB_simd_shuffle_down_uc(x, 0, c);
32683288
}
32693289
return 0;
@@ -3273,6 +3293,10 @@ short SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffleDown, _i32_i16_i32,
32733293
{
32743294
if (Execution == Subgroup)
32753295
{
3296+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
3297+
{
3298+
c = c & (get_max_sub_group_size() - 1);
3299+
}
32763300
return __builtin_IB_simd_shuffle_down_us(x, 0, c);
32773301
}
32783302
return 0;
@@ -3282,6 +3306,10 @@ int SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffleDown, _i32_i32_i32, )
32823306
{
32833307
if (Execution == Subgroup)
32843308
{
3309+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
3310+
{
3311+
c = c & (get_max_sub_group_size() - 1);
3312+
}
32853313
return __builtin_IB_simd_shuffle_down(x, 0, c);
32863314
}
32873315
return 0;
@@ -3291,6 +3319,10 @@ long SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffleDown, _i32_i64_i32,
32913319
{
32923320
if (Execution == Subgroup)
32933321
{
3322+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
3323+
{
3324+
c = c & (get_max_sub_group_size() - 1);
3325+
}
32943326
uint2 X = as_uint2(x);
32953327
uint2 result = (uint2)(__builtin_IB_simd_shuffle_down(X.s0, 0, c),
32963328
__builtin_IB_simd_shuffle_down(X.s1, 0, c));
@@ -3303,6 +3335,10 @@ float SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffleDown, _i32_f32_i32,
33033335
{
33043336
if (Execution == Subgroup)
33053337
{
3338+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
3339+
{
3340+
c = c & (get_max_sub_group_size() - 1);
3341+
}
33063342
return as_float(__builtin_IB_simd_shuffle_down(as_uint(x), 0, c));
33073343
}
33083344
return 0;
@@ -3313,6 +3349,10 @@ double SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffleDown, _i32_f64_i32
33133349
{
33143350
if (Execution == Subgroup)
33153351
{
3352+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
3353+
{
3354+
c = c & (get_max_sub_group_size() - 1);
3355+
}
33163356
uint2 X = as_uint2(x);
33173357
uint2 result = (uint2)(__builtin_IB_simd_shuffle_down(X.s0, 0, c),
33183358
__builtin_IB_simd_shuffle_down(X.s1, 0, c));
@@ -3327,6 +3367,10 @@ half SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffleDown, _i32_f16_i32,
33273367
{
33283368
if (Execution == Subgroup)
33293369
{
3370+
if(BIF_FLAG_CTRL_GET(UseOOBChecks))
3371+
{
3372+
c = c & (get_max_sub_group_size() - 1);
3373+
}
33303374
return as_half(__builtin_IB_simd_shuffle_down_us(as_ushort(x), 0, c));
33313375
}
33323376
return 0;
@@ -3339,6 +3383,10 @@ TYPE SPIRV_OVERLOADABLE SPIRV_BUILTIN(GroupNonUniformShuffleUp, _i32_##TYPE_ABBR
33393383
{ \
33403384
if (Execution == Subgroup) \
33413385
{ \
3386+
if(BIF_FLAG_CTRL_GET(UseOOBChecks)) \
3387+
{ \
3388+
c = c & (get_max_sub_group_size() - 1); \
3389+
} \
33423390
return intel_sub_group_shuffle_up((TYPE) 0, x, c); \
33433391
} \
33443392
return 0; \

IGC/Compiler/Builtins/BIFFlagCtrl/BIFFlagCtrlResolution.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ void BIFFlagCtrlResolution::FillFlagCtrl() {
104104
BIF_FLAG_CTRL_SET(JointMatrixLoadStoreOpt, IGC_GET_FLAG_VALUE(JointMatrixLoadStoreOpt));
105105
}
106106

107+
BIF_FLAG_CTRL_SET(UseOOBChecks, PtrCGC->platform.needsOutOfBoundsBuiltinChecks());
107108
}
108109

109110
#undef BIF_FLAG_CTRL_SET

0 commit comments

Comments
 (0)