Skip to content

Commit 772dced

Browse files
jgu222 authored and igcbot committed
Refactor some 2d block read
For some 2D block reads whose block size is half a GRF, the special handling (replacing the read with the equivalent single-block read) is moved from emitVISAPass to LSCFuncsResolution so that the address-payload version can handle it correctly as well. There should be no functional change.
1 parent d8ec6e6 commit 772dced

File tree

2 files changed

+23
-19
lines changed

2 files changed

+23
-19
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24745,25 +24745,6 @@ void EmitPass::emitLSC2DBlockOperation(llvm::GenIntrinsicInst* inst)
2474524745
destination = BroadcastIfUniform(GetSymbol(inst->getOperand(storeDestinationOperandId)));
2474624746
}
2474724747

24748-
// Special handling of the following:
24749-
// intel_sub_group_2d_block_read_8b_1r32x2c
24750-
// intel_sub_group_2d_block_read_16b_1r16x2c
24751-
// intel_sub_group_2d_block_read_32b_1r8x2c
24752-
// They are defined to return 64 bytes, but the HW block read returns 128 bytes (two GRFs,
24753-
// as a block size must be multiple of GRF, unused part is zero-padded). Additional mov
24754-
// instructions are needed to pack the lower halves of each GRF as the final return value.
24755-
//
24756-
// Here, using equivalent single-block read to avoid those mov instructions by
24757-
// just doubling their width.
24758-
if (isRead && !isPrefetch && !isTranspose && !isVnni &&
24759-
numBlocksV == 2 && blockHeight == 1 &&
24760-
(elemSizeInBits * blockWidth) == 256 &&
24761-
m_currShader->m_Platform->getGRFSize() == 64)
24762-
{
24763-
blockWidth = (2 * blockWidth);
24764-
numBlocksV = 1;
24765-
}
24766-
2476724748
bool emu_read = (isRead && isTranspose &&
2476824749
(elemSizeInBits == 8 || elemSizeInBits == 16));
2476924750
if (!emu_read) {

IGC/Compiler/Optimizer/OpenCLPasses/LSCFuncs/LSCFuncsResolution.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -950,6 +950,29 @@ Instruction* LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst&
950950
{
951951
IGC_ASSERT_MESSAGE(funcName.consume_front("v2"), "Unrecognized v element in __builtin_IB_subgroup_block_read/write.");
952952
}
953+
954+
// Special handling of the following when GRF size = 64 bytes
955+
// intel_sub_group_2d_block_read_8b_1r32x2c (u8_m1k32v2)
956+
// intel_sub_group_2d_block_read_16b_1r16x2c (u16_m1k16v2)
957+
// intel_sub_group_2d_block_read_32b_1r8x2c (u32_m1k8v2)
958+
// They are defined to return 64 bytes, but the HW block read
959+
// returns 128 bytes (two GRFs, as a block size must be multiple
960+
// of GRF, unused part is zero-padded. Note that those APIs have
961+
// their block size to be multiple of GRF (1 GRF) when GRF size
962+
// is 32 bytes). Additional mov instructions are needed to pack
963+
// the lower halves of each GRF as the final return value.
964+
//
965+
// For those cases, instead of 2 blocks, using equivalent single-block
966+
// read to avoid those mov instructions by just doubling their width:
967+
// u8_m1k32v2 --> u8_m1k64v1
968+
// u16_m1k16v2 --> u16_m1k32v1
969+
// u32_m1k8v2 --> u32_m1k16v1
970+
if (m_pCtx->platform.getGRFSize() == 64 && isRead && !isPrefetch &&
971+
numBlocksV == 2 && tileHeight == 1 && (elemSize * tileWidth) == 256)
972+
{
973+
numBlocksV = 1;
974+
tileWidth *= 2;
975+
}
953976
}
954977
else if (isTranspose && !isVnniTransform)
955978
{

0 commit comments

Comments
 (0)