@@ -15752,98 +15752,84 @@ void EmitPass::emitVectorStore(StoreInst* inst, Value* offset, ConstantInt* immO
1575215752// In addition, if 64bit add is not supported, emitAddPair() will be used to
1575315753// use 32bit add/addc to emulate 64bit add.
1575415754//
15755- // Note that argument 'AddrVar' in prepareAddressForUniform() is uniform, so is its return var.
15756- // The argument 'DataVar' in prepareDataForUniform() is uniform, so is its return var.
15755+ // Note that argument 'AddrVar' in prepareAddressForUniform() is uniform, so is
15756+ // its return var. The argument 'DataVar' in prepareDataForUniform() is uniform,
15757+ // so is its return var.
1575715758//
1575815759CVariable* EmitPass::prepareAddressForUniform(
15759- CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t ExecSz , e_alignment Align)
15760+ CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t RequiredNElts , e_alignment Align)
1576015761{
15762+ // If RequiredNElts == 0, use next power of 2 of NElts as return var's num of elements.
15763+ // Otherwise, use RequiredNElts as return var's num of elements.
15764+ uint32_t pow2NElts = (uint32_t)PowerOf2Ceil(NElts);
15765+ uint32_t allocNElts = (RequiredNElts > 0 ? RequiredNElts : pow2NElts);
1576115766 IGC_ASSERT(NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
15762- if (ExecSz == 1 && AddrVar->IsGRFAligned(Align))
15767+ IGC_ASSERT(allocNElts >= pow2NElts);
15768+ if (allocNElts == NElts && NElts == 1 && AddrVar->IsGRFAligned(Align))
1576315769 {
15770+ // No need to create a new var.
1576415771 return AddrVar;
1576515772 }
1576615773 bool isA64 = (AddrVar->GetElemSize() == 8);
15767- SIMDMode simdmode = lanesToSIMDMode(ExecSz );
15768- CVariable* newVar = m_currShader->GetNewVariable(ExecSz , AddrVar->GetType(), Align, true, CName::NONE);
15774+ SIMDMode simdmode = lanesToSIMDMode(pow2NElts );
15775+ CVariable* newVar = m_currShader->GetNewVariable(allocNElts , AddrVar->GetType(), Align, true, CName::NONE);
1576915776
1577015777 CVariable* off;
1577115778 uint32_t incImm = (0x76543210 & maskTrailingOnes<uint32_t>(NElts * 4));
15772- if ((ExecSz <= 4 && EltBytes == 4) || (ExecSz <= 2 && EltBytes == 8))
15779+ if ((pow2NElts <= 4 && EltBytes == 4) || (pow2NElts <= 2 && EltBytes == 8))
1577315780 {
1577415781 // This case needs a single UV immediate
1577515782 incImm = incImm << (EltBytes == 4 ? 2 : 3);
1577615783 off = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
1577715784 }
1577815785 else
1577915786 {
15780- // Need a temporary var to calculate offsets
15781- off = m_currShader->GetNewVariable(ExecSz, ISA_TYPE_UD, EALIGN_DWORD, false, CName::NONE);
15787+ // Need a temporary var to calculate offsets.
15788+ // (Note that the temp is non-uniform, otherwise emitAddPair() won't work.)
15789+ off = m_currShader->GetNewVariable(pow2NElts, ISA_TYPE_UD, EALIGN_DWORD, false, CName::NONE);
1578215790
15783- // actualES is the actual execsize used for computing offsets.
15784- uint32_t actualES = (uint32_t)PowerOf2Ceil(NElts);
15785-
15786- // incImm is UV type and can be used in execsize <= 8 only. If ExecSz is greater
15787- // than the actual number of lanes (for example, 4GRF alignment case), the upper lanes
15788- // beyond need to be zero'ed.
15789- if (ExecSz > actualES)
15790- {
15791- // Need to zero the upper lanes.
15792- m_encoder->SetNoMask();
15793- m_encoder->SetSimdSize(simdmode);
15794- m_encoder->Copy(off, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
15795- m_encoder->Push();
15796- }
15797-
15798- SIMDMode sm = lanesToSIMDMode(actualES);
15799- if (incImm > 0 &&
15800- ((actualES <= 4 && EltBytes == 4) || (actualES <= 2 && EltBytes == 8)))
15801- {
15802- // This case needs a single UV immediate
15803- incImm = incImm << (EltBytes == 4 ? 2 : 3);
15791+ // Need a mov and mul
15792+ m_encoder->SetNoMask();
15793+ m_encoder->SetSimdSize(simdmode);
15794+ m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
15795+ m_encoder->Push();
1580415796
15805- m_encoder->SetNoMask();
15806- m_encoder->SetSimdSize(sm);
15807- m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
15808- m_encoder->Push();
15809- }
15810- else if (incImm > 0)
15811- {
15812- // Need a mov and mul
15813- m_encoder->SetNoMask();
15814- m_encoder->SetSimdSize(sm);
15815- m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
15816- m_encoder->Push();
15797+ m_encoder->SetNoMask();
15798+ m_encoder->SetSimdSize(simdmode);
15799+ m_encoder->SetSrcRegion(0, 1, 1, 0);
15800+ m_encoder->SetSrcRegion(1, 0, 1, 0);
15801+ m_encoder->Mul(off, off, m_currShader->ImmToVariable(EltBytes, ISA_TYPE_UW));
15802+ m_encoder->Push();
15803+ }
1581715804
15818- m_encoder->SetNoMask();
15819- m_encoder->SetSimdSize(sm);
15820- m_encoder->SetSrcRegion(0, 1, 1, 0);
15821- m_encoder->SetSrcRegion(1, 0, 1, 0);
15822- m_encoder->Mul(off, off, m_currShader->ImmToVariable(EltBytes, ISA_TYPE_UW));
15823- m_encoder->Push();
15824- }
15805+ // Only need to initialize pow2NElts elements.
15806+ if (allocNElts > pow2NElts)
15807+ {
15808+ newVar = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, pow2NElts);
1582515809 }
1582615810
15827- // May need splitting for A64
15828- bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
15811+ // Currently, it's impossible to split because of NElts <= 8. In the future, NElts
15812+ // could be 32 and we could need to split.
15813+ bool needSplit = ((pow2NElts * newVar->GetElemSize()) > (2 * (uint32_t)getGRFSize()));
1582915814 if (needSplit)
1583015815 {
1583115816 IGC_ASSERT(!off->IsImmediate());
15832- uint32_t bytes1 = (ExecSz / 2) * newVar->GetElemSize();
15833- uint32_t bytes2 = (ExecSz / 2) * off->GetElemSize();
15834- CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, ExecSz / 2);
15835- CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes1, ExecSz / 2);
15836- CVariable* offHi = m_currShader->GetNewAlias(off, off->GetType(), 0, ExecSz / 2);
15837- CVariable* offLo = m_currShader->GetNewAlias(off, off->GetType(), bytes2, ExecSz / 2);
15817+ uint32_t halfNElts = pow2NElts / 2;
15818+ uint32_t bytes1 = halfNElts * newVar->GetElemSize();
15819+ uint32_t bytes2 = halfNElts * off->GetElemSize();
15820+ CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, halfNElts);
15821+ CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes1, halfNElts);
15822+ CVariable* offHi = m_currShader->GetNewAlias(off, off->GetType(), 0, halfNElts);
15823+ CVariable* offLo = m_currShader->GetNewAlias(off, off->GetType(), bytes2, halfNElts);
1583815824
15839- if (m_currShader->m_Platform->hasNoInt64Inst())
15825+ if (isA64 && m_currShader->m_Platform->hasNoInt64Inst())
1584015826 {
1584115827 emitAddPair(newVarHi, AddrVar, offHi);
1584215828 emitAddPair(newVarLo, AddrVar, offLo);
1584315829 }
1584415830 else
1584515831 {
15846- SIMDMode sm = lanesToSIMDMode(ExecSz / 2 );
15832+ SIMDMode sm = lanesToSIMDMode(halfNElts );
1584715833 m_encoder->SetNoMask();
1584815834 m_encoder->SetUniformSIMDSize(sm);
1584915835 m_encoder->SetSrcRegion(0, 0, 1, 0);
@@ -15859,7 +15845,7 @@ CVariable* EmitPass::prepareAddressForUniform(
1585915845 m_encoder->Push();
1586015846 }
1586115847 }
15862- else if (isA64 && m_currShader->m_Platform->hasNoInt64Inst())
15848+ else if (isA64 && m_currShader->m_Platform->hasNoInt64Inst() && pow2NElts > 1 )
1586315849 {
1586415850 emitAddPair(newVar, AddrVar, off);
1586515851 }
@@ -15869,59 +15855,73 @@ CVariable* EmitPass::prepareAddressForUniform(
1586915855 m_encoder->SetUniformSIMDSize(simdmode);
1587015856 m_encoder->SetSrcRegion(0, 0, 1, 0);
1587115857 m_encoder->SetSrcRegion(1, 1, 1, 0);
15872- m_encoder->Add(newVar, AddrVar, off);
15858+ if (pow2NElts > 1) {
15859+ m_encoder->Add(newVar, AddrVar, off);
15860+ }
15861+ else {
15862+ m_encoder->Copy(newVar, AddrVar);
15863+ }
1587315864 m_encoder->Push();
1587415865 }
1587515866 return newVar;
1587615867}
1587715868
1587815869CVariable* EmitPass::prepareDataForUniform(
15879- CVariable* DataVar, uint32_t ExecSz , e_alignment Align)
15870+ CVariable* DataVar, uint32_t RequiredNElts , e_alignment Align)
1588015871{
1588115872 uint32_t NElts = DataVar->GetNumberElement();
1588215873 uint32_t EltBytes = DataVar->GetElemSize();
15883- IGC_ASSERT(ExecSz >= NElts && NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
15884- if (NElts == ExecSz && !DataVar->IsImmediate() && DataVar->IsGRFAligned(Align))
15874+ uint32_t pow2NElts = (uint32_t)(uint32_t)PowerOf2Ceil(NElts);
15875+ uint32_t allocNElts = RequiredNElts > 0 ? RequiredNElts : pow2NElts;
15876+ IGC_ASSERT(allocNElts >= pow2NElts && NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
15877+ if (NElts == allocNElts && !DataVar->IsImmediate() && DataVar->IsGRFAligned(Align))
1588515878 {
1588615879 return DataVar;
1588715880 }
15888- CVariable* newVar = m_currShader->GetNewVariable(ExecSz , DataVar->GetType(), Align, true, CName::NONE);
15881+ CVariable* newVar = m_currShader->GetNewVariable(allocNElts , DataVar->GetType(), Align, true, CName::NONE);
1588915882
15890- // Initialize to DataVar's first element (set Elts from NElts and up to the first element).
15891- bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
15892- if (needSplit)
15883+ // Need to return a var with pow2NElts elements
15884+ if (allocNElts > pow2NElts)
1589315885 {
15894- uint32_t esz = ExecSz / 2;
15895- uint32_t bytes = esz * newVar->GetElemSize();
15896- CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, esz);
15897- CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes, esz);
15898-
15899- m_encoder->SetNoMask();
15900- m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
15901- m_encoder->SetSrcRegion(0, 0, 1, 0);
15902- m_encoder->Copy(newVarHi, DataVar);
15903- m_encoder->Push();
15904-
15905- m_encoder->SetNoMask();
15906- m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
15907- m_encoder->SetSrcRegion(0, 0, 1, 0);
15908- m_encoder->Copy(newVarLo, DataVar);
15909- m_encoder->Push();
15886+ newVar = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, pow2NElts);
1591015887 }
15911- else
15888+
15889+ // Initialize with DataVar's first element (i.e., the elements at index NElts and above are set to the first element).
15890+ bool initWithElem0 = (pow2NElts > NElts);
15891+ bool needSplit = ((pow2NElts *newVar->GetElemSize()) > (2 * (uint32_t)getGRFSize()));
15892+ if (initWithElem0)
1591215893 {
15894+ if (needSplit)
15895+ {
15896+ uint32_t esz = pow2NElts / 2;
15897+ uint32_t bytes = esz * newVar->GetElemSize();
15898+ CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, esz);
15899+ CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes, esz);
1591315900
15914- m_encoder->SetNoMask();
15915- m_encoder->SetUniformSIMDSize(lanesToSIMDMode(ExecSz));
15916- m_encoder->SetSrcRegion(0, 0, 1, 0);
15917- m_encoder->Copy(newVar, DataVar);
15918- m_encoder->Push();
15901+ m_encoder->SetNoMask();
15902+ m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
15903+ m_encoder->SetSrcRegion(0, 0, 1, 0);
15904+ m_encoder->Copy(newVarHi, DataVar);
15905+ m_encoder->Push();
15906+
15907+ m_encoder->SetNoMask();
15908+ m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
15909+ m_encoder->SetSrcRegion(0, 0, 1, 0);
15910+ m_encoder->Copy(newVarLo, DataVar);
15911+ m_encoder->Push();
15912+ }
15913+ else
15914+ {
15915+ m_encoder->SetNoMask();
15916+ m_encoder->SetUniformSIMDSize(lanesToSIMDMode(pow2NElts));
15917+ m_encoder->SetSrcRegion(0, 0, 1, 0);
15918+ m_encoder->Copy(newVar, DataVar);
15919+ m_encoder->Push();
15920+ }
1591915921 }
1592015922
15921- if (!DataVar->IsImmediate() && NElts > 1)
15923+ if (!initWithElem0 || NElts != 1)
1592215924 {
15923- // Copy values over, the elements from NElts to ExecSz-1 are set to the first element
15924- // in the initialization above.
1592515925 emitVectorCopy(newVar, DataVar, NElts);
1592615926 }
1592715927 return newVar;
0 commit comments