@@ -10088,48 +10088,63 @@ void EmitPass::emitInsert(llvm::Instruction* inst)
1008810088 m_encoder->Push();
1008910089
1009010090 // a0 = addressof(vector variable) + offset2 <-- address of element to insert at
10091- CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
10092- pIndexVar->IsUniform() ? 1 : numLanes(m_currShader->m_SIMDSize),
10093- m_destination->GetType(),
10094- pIndexVar->IsUniform(),
10095- pInstVar->IsUniform());
10096-
10097- m_encoder->AddrAdd(pDstArrElm, m_destination, pOffset2);
10098- m_encoder->Push();
10099-
10100- // If pIndexVar is uniform, we are using 1x1 indirect addressing and
10101- // a single copy is what we need.
1010210091 if (pIndexVar->IsUniform())
1010310092 {
10093+ CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(1, m_destination->GetType(), true, pInstVar->IsUniform());
10094+ m_encoder->AddrAdd(pDstArrElm, m_destination, pOffset2);
10095+ m_encoder->Push();
1010410096 m_encoder->Copy(pDstArrElm, pElemVar);
1010510097 m_encoder->Push();
1010610098 }
1010710099 else
1010810100 {
10109- // Handle the case when the index is non-uniform - we need to lookup a different value
10110- // for each simd lane.
10111- // Since HW doesn't support writing to more than two consecutive GRFs, we need to simulate
10112- // scattered write by a sequence of instructions, each one writing to a single simd-lane.
10113- for (uint lane = 0; lane < numLanes(m_currShader->m_SIMDSize); ++lane)
10101+ int loopCount = (m_currShader->m_dispatchSize == SIMDMode::SIMD32 && m_currShader->m_numberInstance == 1) ? 2 : 1;
10102+ for (int i = 0; i < loopCount; ++i)
1011410103 {
10115- CVariable* immMask = m_currShader->ImmToVariable(1ULL << lane, ISA_TYPE_UD);
10116- CVariable* dstPred = m_currShader->GetNewVariable(
10117- numLanes(m_SimdMode),
10118- ISA_TYPE_BOOL,
10119- EALIGN_BYTE);
10104+ if (i == 1)
10105+ {
10106+ // explicitly set second half as we are manually splitting
10107+ m_encoder->SetSecondHalf(true);
10108+ }
10109+ SIMDMode simdMode = std::min(m_currShader->m_SIMDSize, SIMDMode::SIMD16);
10110+ CVariable* pDstArrElm = m_currShader->GetNewAddressVariable(
10111+ numLanes(simdMode),
10112+ m_destination->GetType(),
10113+ false,
10114+ pInstVar->IsUniform());
1012010115
10121- m_encoder->SetP(dstPred, immMask);
10116+ m_encoder->SetSimdSize(simdMode);
10117+ m_encoder->AddrAdd(pDstArrElm, m_destination, pOffset2);
1012210118 m_encoder->Push();
1012310119
10124- m_encoder->SetPredicate(dstPred);
10125- if (!pElemVar->IsUniform())
10120+ // Handle the case when the index is non-uniform - we need to lookup a different value
10121+ // for each simd lane.
10122+ // Since HW doesn't support scattered GRF writes, we need to simulate
10123+ // scattered write by a sequence of instructions, each one writing to a single simd-lane.
10124+ for (uint lane = 0; lane < numLanes(simdMode); ++lane)
1012610125 {
10127- m_encoder->SetSrcSubReg(0, lane);
10126+ uint position = lane + i * 16;
10127+ CVariable* immMask = m_currShader->ImmToVariable(1ULL << lane, ISA_TYPE_UD);
10128+ CVariable* dstPred = m_currShader->GetNewVariable(
10129+ numLanes(m_SimdMode),
10130+ ISA_TYPE_BOOL,
10131+ EALIGN_BYTE);
10132+
10133+ m_encoder->SetSimdSize(simdMode);
10134+ m_encoder->SetP(dstPred, immMask);
10135+ m_encoder->Push();
10136+
10137+ m_encoder->SetPredicate(dstPred);
10138+ if (!pElemVar->IsUniform())
10139+ {
10140+ m_encoder->SetSrcSubReg(0, position);
10141+ }
10142+ m_encoder->SetSrcRegion(0, 0, 1, 0);
10143+ m_encoder->SetDstSubReg(lane);
10144+ m_encoder->SetSimdSize(simdMode);
10145+ m_encoder->Copy(pDstArrElm, pElemVar);
10146+ m_encoder->Push();
1012810147 }
10129- m_encoder->SetSrcRegion(0, 0, 1, 0);
10130- m_encoder->SetDstSubReg(lane);
10131- m_encoder->Copy(pDstArrElm, pElemVar);
10132- m_encoder->Push();
1013310148 }
1013410149 }
1013510150 }
0 commit comments