@@ -5509,7 +5509,9 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst)
55095509 else
55105510 {
55115511 m_encoder->Add(pTempVar0, pTempVar0, m_currShader->ImmToVariable(blockWidth, ISA_TYPE_UD));
5512- dstSubReg = dstSubReg + scale * blockHeight;
5512+ uint32_t subOffset = maxWidth * scale * blockHeight;
5513+ subOffset /= getGRFSize();
5514+ dstSubReg = dstSubReg + subOffset;
55135515 }
55145516 m_encoder->Push();
55155517
@@ -5548,6 +5550,8 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst)
55485550 {
55495551 dstSubReg = 0;
55505552
5553+ uint32_t srcSubReg = 0;
5554+
55515555 // Join data obtained from pass 0 and pass 1 to make
55525556 // xOffset contiguous from 0 to 63 bytes (making SIMD 16)
55535557 // mov (8) r20.0<1>:ud r28.0<8;8,1>:ud {Align1, Q1}
@@ -5559,16 +5563,63 @@ void EmitPass::emitSimdMediaBlockRead(llvm::Instruction* inst)
55595563 // mov (8) r26.0<1>:ud r31.0<8;8,1>:ud {Align1, Q1}
55605564 // mov (8) r27.0<1>:ud r35.0<8;8,1>:ud {Align1, Q2}
55615565
5562- for (uint32_t i = 0; i < blockHeight; i++)
5563- {
5564- for (uint32_t pass = 0; pass < numPasses; pass++)
5566+
5567+ //For 64 bytes GRF, 32 bytes will be extended to
5568+ //.....
5569+ // A0....A1
5570+ // B0....B1
5571+ // C0....C1
5572+ // D0....D1
5573+ // E0....E1
5574+ // F0....F1
5575+ // G0....G1
5576+ // H0....H1
5577+ //
5578+ // r20....A0....B0........r30....A1....B1
5579+ // r21....C0....D0........r31....C1....D1
5580+ // r22....E0....F0........r32....E1....F1
5581+ // r23....G0....H0........r33....G1....H1
5582+ //
5583+ // r40<--r20,....r30
5584+ // r41<--r20.8,r30.8
5585+ // r42<--r21,....r31
5586+ // r43<--r21.8,r31.8
5587+ // r44<--r22,....r32
5588+ // r45<--r22.8,r32.8
5589+ // r46<--r23,....r33
5590+ // r47<--r23.8,r33.8
5591+ //
5592+ //mov (8) r40.0<1>:ud r20.0<8;8,1>:ud {Align1, Q1}
5593+ //mov (8) r40.8<1>:ud r30.0<8;8,1>:ud {Align1, Q1}
5594+ //mov (8) r41<1>:ud r20.8<8;8,1>:ud {Align1, Q1}
5595+ //mov (8) r41.8<1>:ud r30.8<8;8,1>:ud {Align1, Q1}
5596+
5597+ for (uint32_t i = 0; i < blockHeight; i++) //Height
5598+ {
5599+ uint32_t dstSubRegOffset = 0;
5600+ uint32_t srcSubRegOffset = 0;
5601+
5602+ for (uint32_t pass = 0; pass < numPasses; pass++) //Width
55655603 {
55665604 SIMDMode mode = typeSizeInBytes == 8 && blockWidth != 64 ? SIMDMode::SIMD4 : SIMDMode::SIMD8;
55675605 m_encoder->SetSimdSize(mode);
55685606 m_encoder->SetNoMask();
5569- m_encoder->SetSrcSubVar(0, scale * (i + (blockHeight * pass)));
5607+
5608+ srcSubReg = (scale * (i + (blockHeight * pass)) * maxWidth) / getGRFSize();
5609+ srcSubRegOffset = (i * maxWidth) % getGRFSize();
5610+
5611+ m_encoder->SetSrcSubVar(0, srcSubReg);
5612+ m_encoder->SetSrcSubReg(0, srcSubRegOffset / typeSizeInBytes);
5613+
55705614 m_encoder->SetDstSubVar(dstSubReg);
5571- dstSubReg += scale;
5615+ m_encoder->SetDstSubReg(dstSubRegOffset / typeSizeInBytes);
5616+
5617+ dstSubRegOffset = ((pass + 1) * maxWidth) % getGRFSize();
5618+ if (dstSubRegOffset == 0)
5619+ {
5620+ dstSubReg += scale;
5621+ }
5622+
55725623 m_encoder->Copy(m_destination, pTempDest);
55735624 m_encoder->Push();
55745625 }
@@ -5641,8 +5692,10 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst)
56415692 int scale = (blockWidth == 64) ? 2 : 1;
56425693 for (pass = 0; pass < numPasses; pass++)
56435694 {
5644- uint32_t srcSubVar = pass * scale;
5695+ uint32_t srcSubVar = pass * scale * maxWidth / getGRFSize() ;
56455696 uint32_t dstSubVar = 0;
5697+ uint32_t srcSubRegOffset = (pass * maxWidth) % getGRFSize();
5698+ uint32_t dstSubRegOffset = 0;
56465699
56475700 CVariable* tempdst = nullptr;
56485701 tempdst = m_currShader->GetNewVariable(
@@ -5655,17 +5708,52 @@ void EmitPass::emitSimdMediaBlockWrite(llvm::Instruction* inst)
56555708 // mov (8) r23.0<1>:d r16.0<8;8,1>:d {Align1, Q1, Compacted}
56565709 // mov (8) r24.0<1>:d r18.0<8;8,1>:d {Align1, Q1, Compacted}
56575710 // mov (8) r25.0<1>:d r20.0<8;8,1>:d {Align1, Q1, Compacted}
5711+
5712+ //FOR 64 bytes GRF:
5713+ // A0....A1....A2....A3........r60....r60.8....r61....r61.8
5714+ // B0....B1....B2....B3........r62....r62.8....r63....r63.8
5715+ // C0....C1....C2....C3........r64....r64.8....r65....r65.8
5716+ // D0....D1....D2....D3........r66....r66.8....r67....r67.8
5717+ // E0....E1....E2....E3........r68....r68.8....r69....r69.8
5718+ // F0....F1....F2....F3........r70....r70.8....r71....r71.8
5719+ // G0....G1....G2....G3........r72....r72.8....r73....r73.8
5720+ // H0....H1....H2....H3........r74....r74.8....r75....r75.8
5721+ //
5722+ // block 0
5723+ // mov (8) r20.0<1>:d r60.0<8;8,1>:d {Align1, Q1, Compacted}
5724+ // mov (8) r20.8<1>:d r62.0<8;8,1>:d {Align1, Q1, Compacted}
5725+ // mov (8) r21.0<1>:d r64.0<8;8,1>:d {Align1, Q1, Compacted}
5726+ // mov (8) r21.8<1>:d r66.0<8;8,1>:d {Align1, Q1, Compacted}
5727+ // ...
5728+ //block 1
5729+ // mov (8) r30.0<1>:d r60.8<8;8,1>:d {Align1, Q1, Compacted}
5730+ // mov (8) r30.8<1>:d r62.8<8;8,1>:d {Align1, Q1, Compacted}
5731+ // mov (8) r31.0<1>:d r64.8<8;8,1>:d {Align1, Q1, Compacted}
5732+ // mov (8) r31.8<1>:d r66.8<8;8,1>:d {Align1, Q1, Compacted}
5733+ //...
5734+
56585735 if (numPasses > 1)
56595736 {
56605737 for (uint i = 0; i < nbElements; ++i)
56615738 {
56625739 SIMDMode mode = (typeSizeInBytes == 8 && blockWidth != 64) ? SIMDMode::SIMD4 : SIMDMode::SIMD8;
56635740 m_encoder->SetSimdSize(mode);
56645741 m_encoder->SetNoMask();
5742+
5743+ //Src
56655744 m_encoder->SetSrcSubVar(0, srcSubVar);
5745+ m_encoder->SetSrcSubReg(0, srcSubRegOffset / typeSizeInBytes);
5746+ //Dst
56665747 m_encoder->SetDstSubVar(dstSubVar);
5667- dstSubVar += scale;
5668- srcSubVar = srcSubVar + scale * numPasses;
5748+ m_encoder->SetDstSubReg(dstSubRegOffset / typeSizeInBytes);
5749+ //Strides for dst and src
5750+ dstSubRegOffset = ((i + 1) * maxWidth) % getGRFSize();
5751+ if (dstSubRegOffset == 0)
5752+ {
5753+ dstSubVar += scale;
5754+ }
5755+ srcSubVar = srcSubVar + (scale * numPasses * blockWidth / getGRFSize());
5756+
56695757 m_encoder->Copy(tempdst, data);
56705758 m_encoder->Push();
56715759 }
0 commit comments