File tree Expand file tree Collapse file tree 2 files changed +23
-19
lines changed
Optimizer/OpenCLPasses/LSCFuncs Expand file tree Collapse file tree 2 files changed +23
-19
lines changed Original file line number Diff line number Diff line change @@ -24745,25 +24745,6 @@ void EmitPass::emitLSC2DBlockOperation(llvm::GenIntrinsicInst* inst)
2474524745 destination = BroadcastIfUniform(GetSymbol(inst->getOperand(storeDestinationOperandId)));
2474624746 }
2474724747
24748- // Special handling of the following:
24749- // intel_sub_group_2d_block_read_8b_1r32x2c
24750- // intel_sub_group_2d_block_read_16b_1r16x2c
24751- // intel_sub_group_2d_block_read_32b_1r8x2c
24752- // They are defined to return 64 bytes, but the HW block read returns 128 bytes (two GRFs,
24753- // as a block size must be multiple of GRF, unused part is zero-padded). Additional mov
24754- // instructions are needed to pack the lower halves of each GRF as the final return value.
24755- //
24756- // Here, using equivalent single-block read to avoid those mov instructions by
24757- // just doubling their width.
24758- if (isRead && !isPrefetch && !isTranspose && !isVnni &&
24759- numBlocksV == 2 && blockHeight == 1 &&
24760- (elemSizeInBits * blockWidth) == 256 &&
24761- m_currShader->m_Platform->getGRFSize() == 64)
24762- {
24763- blockWidth = (2 * blockWidth);
24764- numBlocksV = 1;
24765- }
24766-
2476724748 bool emu_read = (isRead && isTranspose &&
2476824749 (elemSizeInBits == 8 || elemSizeInBits == 16));
2476924750 if (!emu_read) {
Original file line number Diff line number Diff line change @@ -950,6 +950,29 @@ Instruction* LSCFuncsResolution::CreateSubGroup2DBlockOperation(llvm::CallInst&
950950 {
951951 IGC_ASSERT_MESSAGE (funcName.consume_front (" v2" ), " Unrecognized v element in __builtin_IB_subgroup_block_read/write." );
952952 }
953+
954+ // Special handling of the following when GRF size = 64 bytes
955+ // intel_sub_group_2d_block_read_8b_1r32x2c (u8_m1k32v2)
956+ // intel_sub_group_2d_block_read_16b_1r16x2c (u16_m1k16v2)
957+ // intel_sub_group_2d_block_read_32b_1r8x2c (u32_m1k8v2)
958+ // They are defined to return 64 bytes, but the HW block read
959+ // returns 128 bytes: two GRFs, because each block's size must be a
960+ // multiple of the GRF size and the unused part is zero-padded. (When
961+ // the GRF size is 32 bytes, each block of those APIs is exactly
962+ // 1 GRF.) Additional mov instructions are needed to pack
963+ // the lower halves of each GRF as the final return value.
964+ //
965+ // For those cases, instead of reading 2 blocks, use the equivalent
966+ // single-block read with doubled width to avoid those mov instructions:
967+ // u8_m1k32v2 --> u8_m1k64v1
968+ // u16_m1k16v2 --> u16_m1k32v1
969+ // u32_m1k8v2 --> u32_m1k16v1
970+ if (m_pCtx->platform .getGRFSize () == 64 && isRead && !isPrefetch &&
971+ numBlocksV == 2 && tileHeight == 1 && (elemSize * tileWidth) == 256 )
972+ {
973+ numBlocksV = 1 ;
974+ tileWidth *= 2 ;
975+ }
953976 }
954977 else if (isTranspose && !isVnniTransform)
955978 {
You can’t perform that action at this time.
0 commit comments