Skip to content

Commit d85d0be

Browse files
pkwasnie-inteligcbot
authored and committed
2D block I/O for SIMD32
Expose SPIR-V API for 2D block load/store/prefetch for SIMD32 kernels. Works for platforms with minimum subgroup-size=16.
1 parent e679b1d commit d85d0be

File tree

9 files changed

+858
-145
lines changed

9 files changed

+858
-145
lines changed

IGC/BiFModule/Implementation/IGCBiF_Intrinsics_Lsc.cl

Lines changed: 193 additions & 142 deletions
Large diffs are not rendered by default.

IGC/BiFModule/Languages/OpenCL/IBiF_Sub_Groups.cl

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1016,7 +1016,7 @@ DEFN_INTEL_SUB_GROUP_BLOCK_WRITE_LSC_CACHEOPTS(intel_subgroup_block_write_cacheo
10161016
// To support new SPV_INTEL_2d_block_io only without matching cl_intel_subgroup_2d_block_io built-in, use
10171017
// DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_ macros
10181018

1019-
#define DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_READ(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
1019+
#define DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_READ_CACHE_CONTROLS(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
10201020
INLINE void __internal_##FUNC_NAME##_cache_controls(__global void* base_address, int width, int height, int pitch, int2 coord, __private void* destination, enum LSC_LDCC cache_controls) \
10211021
{ \
10221022
long baseoffset = as_long(base_address); \
@@ -1027,11 +1027,21 @@ INLINE void __internal_##FUNC_NAME##_cache_controls(__global void* base_address,
10271027
*(__private INTERNAL_DST_TYPE*)destination = ret; \
10281028
}
10291029

1030+
// Instantiates DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_READ_CACHE_CONTROLS twice:
//   1. under FUNC_NAME, with INTERNAL_DST_TYPE and INTERNAL_FUNC;
//   2. under FUNC_NAME##_sg32, with HALVE_TYPE(INTERNAL_DST_TYPE) and INTERNAL_FUNC##_sg32.
// The resulting helpers are named __internal_<FUNC_NAME>_cache_controls and
// __internal_<FUNC_NAME>_sg32_cache_controls.
// NOTE(review): HALVE_TYPE is defined elsewhere; presumably it yields a type half
// the size of INTERNAL_DST_TYPE for the sub-group-size-32 variant - confirm.
#define DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_READ(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
1031+
DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_READ_CACHE_CONTROLS(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
1032+
DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_READ_CACHE_CONTROLS(FUNC_NAME##_sg32, HALVE_TYPE(INTERNAL_DST_TYPE), INTERNAL_FUNC##_sg32)
1033+
1034+
// The same 2D block dimensions use different data type per work item
1035+
// depending on the subgroup size. Define unique functions for each variant.
10301036
// Generates the 2D block read entry points for one block shape:
//   - the internal cache-controls helpers, via DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_READ;
//   - an OVERLOADABLE FUNC_NAME that forwards to __internal_<FUNC_NAME>_cache_controls;
//   - an OVERLOADABLE FUNC_NAME##_sg32 that forwards to __internal_<FUNC_NAME>_sg32_cache_controls.
// Both wrappers take a __private DST_PTR_TYPE* destination, pass it on as
// __private void*, and always use LSC_LDCC_DEFAULT as the cache control.
#define DEFN_INTEL_SUB_GROUP_2D_BLOCK_READ(FUNC_NAME, DST_PTR_TYPE, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
10311037
DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_READ(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
10321038
INLINE void OVERLOADABLE FUNC_NAME(__global void* base_address, int width, int height, int pitch, int2 coord, __private DST_PTR_TYPE* destination) \
10331039
{ \
10341040
__internal_##FUNC_NAME##_cache_controls(base_address, width, height, pitch, coord, (__private void *)destination, LSC_LDCC_DEFAULT); \
1041+
} \
1042+
INLINE void OVERLOADABLE FUNC_NAME##_sg32(__global void* base_address, int width, int height, int pitch, int2 coord, __private DST_PTR_TYPE* destination) \
1043+
{ \
1044+
__internal_##FUNC_NAME##_sg32_cache_controls(base_address, width, height, pitch, coord, (__private void *)destination, LSC_LDCC_DEFAULT); \
10351045
}
10361046

10371047
// type d8, block width 16, array length 1
@@ -1237,7 +1247,7 @@ DEFN_INTEL_SUB_GROUP_2D_BLOCK_PREFETCH(intel_sub_group_2d_block_prefetch_32b_32r
12371247
DEFN_INTEL_SUB_GROUP_2D_BLOCK_PREFETCH(intel_sub_group_2d_block_prefetch_8b_32r16x2c, __builtin_IB_subgroup_block_read_prefetch_u8_m32k16v2)
12381248
DEFN_INTEL_SUB_GROUP_2D_BLOCK_PREFETCH(intel_sub_group_2d_block_prefetch_8b_32r16x4c, __builtin_IB_subgroup_block_read_prefetch_u8_m32k16v4)
12391249

1240-
#define DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_WRITE(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
1250+
#define DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_WRITE_CACHE_CONTROLS(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
12411251
INLINE void __internal_##FUNC_NAME##_cache_controls(__global void* base_address, int width, int height, int pitch, int2 coord, private void* val, enum LSC_LDCC cache_controls) \
12421252
{ \
12431253
long baseoffset = as_long(base_address); \
@@ -1247,11 +1257,21 @@ INLINE void __internal_##FUNC_NAME##_cache_controls(__global void* base_address,
12471257
INTERNAL_FUNC(baseoffset, width_minus_one, height_minus_one, pitch_minus_one, coord, *(private INTERNAL_DST_TYPE*)val, cache_controls); \
12481258
}
12491259

1260+
// Instantiates DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_WRITE_CACHE_CONTROLS twice:
//   1. under FUNC_NAME, with INTERNAL_DST_TYPE and INTERNAL_FUNC;
//   2. under FUNC_NAME##_sg32, with HALVE_TYPE(INTERNAL_DST_TYPE) and INTERNAL_FUNC##_sg32.
// The resulting helpers are named __internal_<FUNC_NAME>_cache_controls and
// __internal_<FUNC_NAME>_sg32_cache_controls.
// NOTE(review): HALVE_TYPE is defined elsewhere; presumably it yields a type half
// the size of INTERNAL_DST_TYPE for the sub-group-size-32 variant - confirm.
#define DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_WRITE(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
1261+
DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_WRITE_CACHE_CONTROLS(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
1262+
DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_WRITE_CACHE_CONTROLS(FUNC_NAME##_sg32, HALVE_TYPE(INTERNAL_DST_TYPE), INTERNAL_FUNC##_sg32)
1263+
1264+
// The same 2D block dimensions use different data type per work item
1265+
// depending on the subgroup size. Define unique functions for each variant.
12501266
// Generates the 2D block write entry points for one block shape:
//   - the internal cache-controls helpers, via DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_WRITE;
//   - an OVERLOADABLE FUNC_NAME that forwards to __internal_<FUNC_NAME>_cache_controls;
//   - an OVERLOADABLE FUNC_NAME##_sg32 that forwards to __internal_<FUNC_NAME>_sg32_cache_controls.
// Both wrappers take a private DST_PTR_TYPE* val, pass it on as private void*,
// and always use LSC_LDCC_DEFAULT as the cache control.
#define DEFN_INTEL_SUB_GROUP_2D_BLOCK_WRITE(FUNC_NAME, DST_PTR_TYPE, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
12511267
DEFN_INTERNAL_INTEL_SUB_GROUP_2D_BLOCK_WRITE(FUNC_NAME, INTERNAL_DST_TYPE, INTERNAL_FUNC) \
12521268
INLINE void OVERLOADABLE FUNC_NAME(__global void* base_address, int width, int height, int pitch, int2 coord, private DST_PTR_TYPE* val) \
12531269
{ \
12541270
__internal_##FUNC_NAME##_cache_controls(base_address, width, height, pitch, coord, (private void*) val, LSC_LDCC_DEFAULT); \
1271+
} \
1272+
INLINE void OVERLOADABLE FUNC_NAME##_sg32(__global void* base_address, int width, int height, int pitch, int2 coord, private DST_PTR_TYPE* val) \
1273+
{ \
1274+
__internal_##FUNC_NAME##_sg32_cache_controls(base_address, width, height, pitch, coord, (private void*) val, LSC_LDCC_DEFAULT); \
12551275
}
12561276

12571277
DEFN_INTEL_SUB_GROUP_2D_BLOCK_WRITE(intel_sub_group_2d_block_write_8b_1r32x1c, ushort, ushort, __builtin_IB_subgroup_block_write_cacheopts_u8_m1k32v1)

IGC/Compiler/CISACodeGen/helper.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2278,6 +2278,13 @@ Function *getUniqueEntryFunc(const IGCMD::MetaDataUtils *pM, IGC::ModuleMetaData
22782278
return entryFunc;
22792279
}
22802280

2281+
// Returns the SIMD (sub-group) size recorded in metadata for function F.
// Returns 0 when F has no entry in the functions-info metadata.
//
// M - metadata utilities that are queried for the function's info item.
// F - function whose SIMD size is requested.
int getSIMDSize(const IGCMD::MetaDataUtils *M, llvm::Function *F) {
  // Search the functions-info map once and reuse the iterator, rather than
  // looking the same key up again through getFunctionsInfoItem().
  const auto funcInfoIt = M->findFunctionsInfoItem(F);
  if (funcInfoIt == M->end_FunctionsInfo()) {
    return 0;
  }
  return funcInfoIt->second->getSubGroupSize()->getSIMDSize();
}
2287+
22812288
// If true, the codegen will likely not emit instruction for this instruction.
22822289
bool isNoOpInst(Instruction *I, CodeGenContext *Ctx) {
22832290
if (isa<BitCastInst>(I) || isa<IntToPtrInst>(I) || isa<PtrToIntInst>(I)) {

IGC/Compiler/CISACodeGen/helper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,10 @@ inline bool isNonEntryMultirateShader(const llvm::Function *CF) {
377377
// All subsequent calls to this function will get the entry set by the first call.
378378
llvm::Function *getUniqueEntryFunc(const IGCMD::MetaDataUtils *pM, IGC::ModuleMetaData *pModMD);
379379

380+
// Returns the SIMD size for the given function from metadata.
381+
// Returns 0 if the function is not in metadata or has no SIMD size defined.
382+
int getSIMDSize(const IGCMD::MetaDataUtils *M, llvm::Function *F);
383+
380384
template <typename T> inline bool RTWriteHasSource0Alpha(const T *rtWrite, ModuleMetaData *md) {
381385
return (nullptr != rtWrite->getSource0Alpha()) && !llvm::isa<llvm::UndefValue>(rtWrite->getSource0Alpha());
382386
}

IGC/Compiler/Optimizer/OpenCLPasses/LSCFuncs/LSCFuncsResolution.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,13 @@ Instruction *LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst &
10151015
numBlocksV = 4;
10161016
}
10171017

1018+
// In sub-group size = 32, block dimensions are the same as sub-group size = 16.
1019+
// __builtin_IB_subgroup_block_read_cacheopts_transform_u8_k32_sg32
1020+
// __builtin_IB_subgroup_block_read_cacheopts_transform_u8_k32n16v2_sg32
1021+
if (funcName.consume_front("_sg32")) {
1022+
tileWidth = 16;
1023+
}
1024+
10181025
// __builtin_IB_subgroup_block_read_flat_transform_u8_k32
10191026
tileHeight = 32;
10201027
} else {
@@ -1041,6 +1048,13 @@ Instruction *LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst &
10411048
if (funcName.consume_front("v2")) {
10421049
numBlocksV = 2;
10431050
}
1051+
1052+
// In sub-group size = 32, block dimensions are the same as sub-group size = 16.
1053+
// __builtin_IB_subgroup_block_read_cacheopts_transform_u16_k16_sg32
1054+
// __builtin_IB_subgroup_block_read_cacheopts_transform_u16_k32n16v1_sg32
1055+
if (funcName.consume_front("_sg32")) {
1056+
tileWidth = 16;
1057+
}
10441058
}
10451059
} else {
10461060
IGC_ASSERT_MESSAGE(0, "Transpose and transform should not be used together.");

IGC/Compiler/Optimizer/OpenCLPasses/Spv2dBlockIOResolution/Spv2dBlockIOResolution.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,11 @@ template <typename CCT> void Spv2dBlockIOResolution::visit2DBlockSPVCallInst(Cal
175175
return;
176176
}
177177

178-
newFuncName << elemSize << "b_" << tileHeight << "r" << tileWidth << "x" << numBlocksV << "c_cache_controls";
178+
int simdSize = IGC::getSIMDSize(getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils(), CI.getParent()->getParent());
179+
bool isSimd32Op = simdSize == 32 && op != Prefetch;
180+
181+
newFuncName << elemSize << "b_" << tileHeight << "r" << tileWidth << "x" << numBlocksV << "c"
182+
<< (isSimd32Op ? "_sg32" : "") << "_cache_controls";
179183
auto newFunction = m_Module->getOrInsertFunction(newFuncName.str(), FT);
180184

181185
auto newCall = CallInst::Create(newFunction, args, "", &CI);

0 commit comments

Comments
 (0)