@@ -15752,84 +15752,98 @@ void EmitPass::emitVectorStore(StoreInst* inst, Value* offset, ConstantInt* immO
 // In addition, if 64bit add is not supported, emitAddPair() will be used to
 // use 32bit add/addc to emulate 64bit add.
 //
-// Note that argument 'AddrVar' in prepareAddressForUniform() is uniform, so is
-// its return var. The argument 'DataVar' in prepareDataForUniform() is uniform,
-// so is its return var.
+// Note that argument 'AddrVar' in prepareAddressForUniform() is uniform, so is its return var.
+// The argument 'DataVar' in prepareDataForUniform() is uniform, so is its return var.
 //
 CVariable* EmitPass::prepareAddressForUniform(
-    CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t RequiredNElts, e_alignment Align)
+    CVariable* AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t ExecSz, e_alignment Align)
 {
-    // If RequiredNElts == 0, use next power of 2 of NElts as return var's num of elements.
-    // otherwise, user RequiredNElts as return var's num of elements.
-    uint32_t pow2NElts = (uint32_t)PowerOf2Ceil(NElts);
-    uint32_t allocNElts = (RequiredNElts > 0 ? RequiredNElts : pow2NElts);
     IGC_ASSERT(NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
-    IGC_ASSERT(allocNElts >= pow2NElts);
-    if (allocNElts == NElts && AddrVar->IsGRFAligned(Align))
+    if (ExecSz == 1 && AddrVar->IsGRFAligned(Align))
     {
-        // No need to create a new var.
         return AddrVar;
     }
     bool isA64 = (AddrVar->GetElemSize() == 8);
-    SIMDMode simdmode = lanesToSIMDMode(pow2NElts);
-    CVariable* newVar = m_currShader->GetNewVariable(allocNElts, AddrVar->GetType(), Align, true, CName::NONE);
+    SIMDMode simdmode = lanesToSIMDMode(ExecSz);
+    CVariable* newVar = m_currShader->GetNewVariable(ExecSz, AddrVar->GetType(), Align, true, CName::NONE);
 
     CVariable* off;
     uint32_t incImm = (0x76543210 & maskTrailingOnes<uint32_t>(NElts * 4));
-    if ((pow2NElts <= 4 && EltBytes == 4) || (pow2NElts <= 2 && EltBytes == 8))
+    if ((ExecSz <= 4 && EltBytes == 4) || (ExecSz <= 2 && EltBytes == 8))
     {
         // This case needs a single UV immediate
         incImm = incImm << (EltBytes == 4 ? 2 : 3);
         off = m_currShader->ImmToVariable(incImm, ISA_TYPE_UV);
     }
     else
     {
-        // Need a temporary var to calculate offsets.
-        // (Note that the temp is non-uniform, otherwise emitAddrPair() won't work.)
-        off = m_currShader->GetNewVariable(pow2NElts, ISA_TYPE_UD, EALIGN_DWORD, false, CName::NONE);
+        // Need a temporary var to calculate offsets
+        off = m_currShader->GetNewVariable(ExecSz, ISA_TYPE_UD, EALIGN_DWORD, false, CName::NONE);
 
-        // Need a mov and mul
-        m_encoder->SetNoMask();
-        m_encoder->SetSimdSize(simdmode);
-        m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
-        m_encoder->Push();
+        // actualES is the actual execsize used for computing offsets.
+        uint32_t actualES = (uint32_t)PowerOf2Ceil(NElts);
 
-        m_encoder->SetNoMask();
-        m_encoder->SetSimdSize(simdmode);
-        m_encoder->SetSrcRegion(0, 1, 1, 0);
-        m_encoder->SetSrcRegion(1, 0, 1, 0);
-        m_encoder->Mul(off, off, m_currShader->ImmToVariable(EltBytes, ISA_TYPE_UW));
-        m_encoder->Push();
-    }
+        // incImm is UV type and can be used in execsize <= 8 only. If ExecSz is greater
+        // than the actual number of lanes (for example, 4GRF alignment case), the upper lanes
+        // beyond need to be zero'ed.
+        if (ExecSz > actualES)
+        {
+            // Need to zero the upper lanes.
+            m_encoder->SetNoMask();
+            m_encoder->SetSimdSize(simdmode);
+            m_encoder->Copy(off, m_currShader->ImmToVariable(0, ISA_TYPE_UD));
+            m_encoder->Push();
+        }
 
-    // Only need to initialize pow2NElts elements.
-    if (allocNElts > pow2NElts)
-    {
-        newVar = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, pow2NElts);
+        SIMDMode sm = lanesToSIMDMode(actualES);
+        if (incImm > 0 &&
+            ((actualES <= 4 && EltBytes == 4) || (actualES <= 2 && EltBytes == 8)))
+        {
+            // This case needs a single UV immediate
+            incImm = incImm << (EltBytes == 4 ? 2 : 3);
+
+            m_encoder->SetNoMask();
+            m_encoder->SetSimdSize(sm);
+            m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
+            m_encoder->Push();
+        }
+        else if (incImm > 0)
+        {
+            // Need a mov and mul
+            m_encoder->SetNoMask();
+            m_encoder->SetSimdSize(sm);
+            m_encoder->Copy(off, m_currShader->ImmToVariable(incImm, ISA_TYPE_UV));
+            m_encoder->Push();
+
+            m_encoder->SetNoMask();
+            m_encoder->SetSimdSize(sm);
+            m_encoder->SetSrcRegion(0, 1, 1, 0);
+            m_encoder->SetSrcRegion(1, 0, 1, 0);
+            m_encoder->Mul(off, off, m_currShader->ImmToVariable(EltBytes, ISA_TYPE_UW));
+            m_encoder->Push();
+        }
     }
 
-    // Currently, it's impossible to split because of NElts <= 8. In the future, NElts
-    // could be 32 and we could need to split.
-    bool needSplit = ((pow2NElts * newVar->GetElemSize()) > (2 * (uint32_t)getGRFSize()));
+    // May need splitting for A64
+    bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
     if (needSplit)
     {
         IGC_ASSERT(!off->IsImmediate());
-        uint32_t halfNElts = pow2NElts / 2;
-        uint32_t bytes1 = halfNElts * newVar->GetElemSize();
-        uint32_t bytes2 = halfNElts * off->GetElemSize();
-        CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, halfNElts);
-        CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes1, halfNElts);
-        CVariable* offHi = m_currShader->GetNewAlias(off, off->GetType(), 0, halfNElts);
-        CVariable* offLo = m_currShader->GetNewAlias(off, off->GetType(), bytes2, halfNElts);
+        uint32_t bytes1 = (ExecSz / 2) * newVar->GetElemSize();
+        uint32_t bytes2 = (ExecSz / 2) * off->GetElemSize();
+        CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, ExecSz / 2);
+        CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes1, ExecSz / 2);
+        CVariable* offHi = m_currShader->GetNewAlias(off, off->GetType(), 0, ExecSz / 2);
+        CVariable* offLo = m_currShader->GetNewAlias(off, off->GetType(), bytes2, ExecSz / 2);
 
-        if (isA64 && m_currShader->m_Platform->hasNoInt64Inst())
+        if (m_currShader->m_Platform->hasNoInt64Inst())
         {
             emitAddPair(newVarHi, AddrVar, offHi);
             emitAddPair(newVarLo, AddrVar, offLo);
         }
         else
         {
-            SIMDMode sm = lanesToSIMDMode(halfNElts);
+            SIMDMode sm = lanesToSIMDMode(ExecSz / 2);
             m_encoder->SetNoMask();
             m_encoder->SetUniformSIMDSize(sm);
             m_encoder->SetSrcRegion(0, 0, 1, 0);
@@ -15845,7 +15859,7 @@ CVariable* EmitPass::prepareAddressForUniform(
             m_encoder->Push();
         }
     }
-    else if (isA64 && m_currShader->m_Platform->hasNoInt64Inst() && pow2NElts > 1)
+    else if (isA64 && m_currShader->m_Platform->hasNoInt64Inst())
     {
         emitAddPair(newVar, AddrVar, off);
     }
@@ -15855,73 +15869,59 @@ CVariable* EmitPass::prepareAddressForUniform(
         m_encoder->SetUniformSIMDSize(simdmode);
         m_encoder->SetSrcRegion(0, 0, 1, 0);
         m_encoder->SetSrcRegion(1, 1, 1, 0);
-        if (pow2NElts > 1) {
-            m_encoder->Add(newVar, AddrVar, off);
-        }
-        else {
-            m_encoder->Copy(newVar, AddrVar);
-        }
+        m_encoder->Add(newVar, AddrVar, off);
         m_encoder->Push();
     }
     return newVar;
 }
 
 CVariable* EmitPass::prepareDataForUniform(
-    CVariable* DataVar, uint32_t RequiredNElts, e_alignment Align)
+    CVariable* DataVar, uint32_t ExecSz, e_alignment Align)
 {
     uint32_t NElts = DataVar->GetNumberElement();
     uint32_t EltBytes = DataVar->GetElemSize();
-    uint32_t pow2NElts = (uint32_t)(uint32_t)PowerOf2Ceil(NElts);
-    uint32_t allocNElts = RequiredNElts > 0 ? RequiredNElts : pow2NElts;
-    IGC_ASSERT(allocNElts >= pow2NElts && NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
-    if (NElts == allocNElts && !DataVar->IsImmediate() && DataVar->IsGRFAligned(Align))
+    IGC_ASSERT(ExecSz >= NElts && NElts <= 8 && (EltBytes == 4 || EltBytes == 8));
+    if (NElts == ExecSz && !DataVar->IsImmediate() && DataVar->IsGRFAligned(Align))
     {
         return DataVar;
     }
-    CVariable* newVar = m_currShader->GetNewVariable(allocNElts, DataVar->GetType(), Align, true, CName::NONE);
+    CVariable* newVar = m_currShader->GetNewVariable(ExecSz, DataVar->GetType(), Align, true, CName::NONE);
 
-    // Need to return a var with pow2NElts elements
-    if (allocNElts > pow2NElts)
+    // Initialize to DataVar's first element (set Elts from NElts and up to the first element).
+    bool needSplit = (newVar->GetSize() > (2 * (uint32_t)getGRFSize()));
+    if (needSplit)
     {
-        newVar = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, pow2NElts);
-    }
+        uint32_t esz = ExecSz / 2;
+        uint32_t bytes = esz * newVar->GetElemSize();
+        CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, esz);
+        CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes, esz);
 
-    // Initialize to DataVar's first element (set Elts from NElts and up to the first element).
-    bool initWithElem0 = (pow2NElts > NElts);
-    bool needSplit = ((pow2NElts * newVar->GetElemSize()) > (2 * (uint32_t)getGRFSize()));
-    if (initWithElem0)
-    {
-        if (needSplit)
-        {
-            uint32_t esz = pow2NElts / 2;
-            uint32_t bytes = esz * newVar->GetElemSize();
-            CVariable* newVarHi = m_currShader->GetNewAlias(newVar, newVar->GetType(), 0, esz);
-            CVariable* newVarLo = m_currShader->GetNewAlias(newVar, newVar->GetType(), bytes, esz);
+        m_encoder->SetNoMask();
+        m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
+        m_encoder->SetSrcRegion(0, 0, 1, 0);
+        m_encoder->Copy(newVarHi, DataVar);
+        m_encoder->Push();
 
-            m_encoder->SetNoMask();
-            m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
-            m_encoder->SetSrcRegion(0, 0, 1, 0);
-            m_encoder->Copy(newVarHi, DataVar);
-            m_encoder->Push();
+        m_encoder->SetNoMask();
+        m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
+        m_encoder->SetSrcRegion(0, 0, 1, 0);
+        m_encoder->Copy(newVarLo, DataVar);
+        m_encoder->Push();
+    }
+    else
+    {
 
-            m_encoder->SetNoMask();
-            m_encoder->SetUniformSIMDSize(lanesToSIMDMode(esz));
-            m_encoder->SetSrcRegion(0, 0, 1, 0);
-            m_encoder->Copy(newVarLo, DataVar);
-            m_encoder->Push();
-        }
-        else
-        {
-            m_encoder->SetNoMask();
-            m_encoder->SetUniformSIMDSize(lanesToSIMDMode(pow2NElts));
-            m_encoder->SetSrcRegion(0, 0, 1, 0);
-            m_encoder->Copy(newVar, DataVar);
-            m_encoder->Push();
-        }
+        m_encoder->SetNoMask();
+        m_encoder->SetUniformSIMDSize(lanesToSIMDMode(ExecSz));
+        m_encoder->SetSrcRegion(0, 0, 1, 0);
+        m_encoder->Copy(newVar, DataVar);
+        m_encoder->Push();
     }
 
-    if (!initWithElem0 || NElts != 1)
+    if (!DataVar->IsImmediate() && NElts > 1)
     {
+        // Copy values over, the elements from NElts to ExecSz-1 are set to the first element
+        // in the initialization above.
         emitVectorCopy(newVar, DataVar, NElts);
     }
     return newVar;
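
Note on the offset construction in prepareAddressForUniform(): the packed immediate 0x76543210 holds one 4-bit lane index per nibble, maskTrailingOnes<uint32_t>(NElts * 4) keeps only the nibbles of the lanes actually addressed, and the left shift by 2 (EltBytes == 4) or 3 (EltBytes == 8) scales every nibble into a per-lane byte offset. The scaling only stays within the 4-bit range of the UV type while lane * EltBytes <= 15, which is why the single-immediate path is guarded by (ExecSz <= 4 && EltBytes == 4) || (ExecSz <= 2 && EltBytes == 8) and the other cases go through the separate mov + mul sequence. The host-side C++ sketch below is illustrative only; it is not IGC code, and maskTrailingOnes32 merely stands in for llvm::maskTrailingOnes<uint32_t>.

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for llvm::maskTrailingOnes<uint32_t>: the lowest n bits set.
static uint32_t maskTrailingOnes32(unsigned n) {
    return n >= 32 ? 0xFFFFFFFFu : ((1u << n) - 1u);
}

int main() {
    const uint32_t NElts = 4;     // lanes being addressed
    const uint32_t EltBytes = 4;  // element size in bytes

    // Keep one nibble (4 bits) per addressed lane: 0x76543210 -> 0x3210 for 4 lanes.
    uint32_t incImm = 0x76543210u & maskTrailingOnes32(NElts * 4);
    // Nibble-wise multiply by EltBytes; valid only while lane * EltBytes fits in 4 bits.
    incImm <<= (EltBytes == 4 ? 2 : 3);

    for (uint32_t lane = 0; lane < NElts; ++lane) {
        uint32_t byteOff = (incImm >> (4 * lane)) & 0xFu;  // decode this lane's nibble
        printf("lane %u -> byte offset %u (expected %u)\n", lane, byteOff, lane * EltBytes);
    }
    return 0;
}
```

Running it prints offsets 0, 4, 8, 12 for the four lanes, i.e. the per-lane byte offsets that get added to AddrVar.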
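The comment at the top of this hunk mentions that, when the platform has no 64-bit integer add (hasNoInt64Inst()), emitAddPair() emulates the A64 address add with 32-bit add/addc. The actual emitAddPair() implementation is not part of this diff; the sketch below only illustrates the carry-propagation idea in plain C++ with assumed {lo, hi} halves.

```cpp
#include <cstdint>
#include <cstdio>

// A 64-bit value kept as two 32-bit halves, as a conceptual model only.
struct UIntPair { uint32_t lo, hi; };

// Add the low halves first; the carry out of that add (what an addc-style
// instruction exposes) is folded into the sum of the high halves.
static UIntPair addPair(UIntPair a, UIntPair b) {
    UIntPair r;
    r.lo = a.lo + b.lo;                        // 32-bit add, wraps on overflow
    uint32_t carry = (r.lo < a.lo) ? 1u : 0u;  // overflow happened iff result < first operand
    r.hi = a.hi + b.hi + carry;
    return r;
}

int main() {
    UIntPair base = { 0xFFFFFFF0u, 0x00000001u };  // 0x1FFFFFFF0
    UIntPair off  = { 0x00000020u, 0x00000000u };  // adding 0x20 crosses the 32-bit boundary
    UIntPair sum  = addPair(base, off);
    printf("0x%08X%08X\n", (unsigned)sum.hi, (unsigned)sum.lo);  // prints 0x0000000200000010
    return 0;
}
```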
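Finally, the payload shape that prepareDataForUniform() builds after this change can be summarized as: allocate ExecSz lanes, broadcast element 0 into all of them, then overwrite the first NElts lanes with the real data via emitVectorCopy (when DataVar is not a scalar immediate). A minimal host-side model of that lane layout, again illustrative only and not IGC code:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Model of the lane layout only: the first NElts lanes carry the payload,
// the remaining lanes up to ExecSz are padded with a copy of element 0.
static std::vector<uint32_t> prepareDataLayout(const std::vector<uint32_t>& data, uint32_t execSz) {
    // Assumes data is non-empty and data.size() <= execSz, mirroring the
    // ExecSz >= NElts assertion in the real function.
    std::vector<uint32_t> out(execSz, data[0]);  // broadcast element 0 everywhere
    for (size_t i = 0; i < data.size(); ++i) {
        out[i] = data[i];                        // then copy the real elements over
    }
    return out;
}

int main() {
    // NElts = 3, ExecSz = 8 -> prints: 10 11 12 10 10 10 10 10
    for (uint32_t v : prepareDataLayout({10, 11, 12}, 8)) {
        printf("%u ", v);
    }
    printf("\n");
    return 0;
}
```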