@@ -943,7 +943,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
943943 for (uint i = 0; i < m_pattern->m_numBlocks; i++)
944944 {
945945 SBasicBlock& block = m_pattern->m_blocks[i];
946- block.m_activeMask = nullptr ; // clear for each SIMD size
946+ block.clearCaching() ; // clear for each SIMD size
947947 m_currentBlock = i;
948948 if (m_blockCoalescing->IsEmptyBlock(block.bb))
949949 {
@@ -975,6 +975,8 @@ bool EmitPass::runOnFunction(llvm::Function& F)
975975 while (I != E)
976976 {
977977 Instruction* llvmInst = I->m_root;
978+ resetCurrInstNumInstances();
979+
978980 if (llvmInst->getDebugLoc())
979981 {
980982 unsigned int curLineNumber = llvmInst->getDebugLoc().getLine();
@@ -1004,6 +1006,8 @@ bool EmitPass::runOnFunction(llvm::Function& F)
10041006 bool slicing = false;
10051007 uint numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
10061008 IGC_ASSERT(numInstance == 1 || numInstance == 2);
1009+ // cache the number of instances for the current instruction
1010+ setCurrInstNumInstances(numInstance);
10071011
10081012 if (slicing && !disableSlicing)
10091013 {
@@ -1033,6 +1037,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
10331037 if (slicing)
10341038 {
10351039 numInstance = DecideInstanceAndSlice(*block.bb, *I, slicing);
1040+ setCurrInstNumInstances(numInstance);
10361041 }
10371042
10381043 if (llvmtoVISADump)
@@ -12327,32 +12332,93 @@ void EmitPass::emitScalarAtomics(
1232712332 uniformAtomicOp = EATOMIC_IADD;
1232812333 }
1232912334 bool returnsImmValue = (!pInst->use_empty());
12330- CVariable* pFinalAtomicSrcVal = m_currShader->GetNewVariable(
12331- 1,
12332- type,
12333- isA64 ? EALIGN_2GRF : EALIGN_GRF,
12334- true,
12335- CName::NONE);
12335+ CVariable* pFinalAtomicSrcVal;
1233612336 CVariable* pSrcsArr[2] = { nullptr, nullptr };
12337- if (returnsImmValue)
12337+
12338+ if (op == EOPCODE_ADD && bitWidth == 32 && pSrc->IsUniform() &&
12339+ getCurrInstNumInstances() == 1 && !returnsImmValue)
1233812340 {
12339- // sum all the lanes
12340- emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
12341+ // Special case for uniform DW src (like atomic_add(1)) without return value.
12342+ // Note: limit this code for a single instance for now as scalar atomic must have
12343+ // instance = 1 (see DecideInstanceAndSlice()).
12344+ //
12345+ // The following sequence will be generated:
12346+ // (W) mov (16|M0) f0.0<1>:uw 0:uw
12347+ // cmp.eq.f0.0 (16|M0) dummy:uw dummy:uw
12348+ // (W) mov (1|M0) r2.0<1>:uw f0.0:uw
12349+ // (W) cbit (1|M0) r1.0:uw r2.0:uw <-- r1.0 : number of active lanes
12350+ // (W) mul (1|M0) r10:ud pSrc r1.0:uw
12351+ SBasicBlock& currBlk = getCurrentBlock();
12352+ CVariable* numActiveLanes = currBlk.m_numActiveLanes;
12353+ if (numActiveLanes == nullptr)
12354+ {
12355+ CVariable* emask = GetExecutionMask(); // execution mask for the entire dispatch size
12356+ // Count the number of '1' bits we have in the execmask to get the number of active lanes.
12357+ // For example, given emask = 1011011000100010b, numActiveLanes = 7
12358+ // This will handle cases in which not all lanes are active.
12359+ numActiveLanes = m_currShader->GetNewVariable(1, ISA_TYPE_W, EALIGN_DWORD, true, CName::NONE);
12360+ m_encoder->CBit(numActiveLanes, emask);
12361+ m_encoder->Push();
12362+
12363+ // save it for possible re-use later.
12364+ currBlk.m_numActiveLanes = numActiveLanes;
12365+ }
1234112366
12342- CVariable* pSrcCopy = pSrcsArr[0];
12343- if (m_currShader->m_numberInstance == 2)
12367+ // pFinalAtomicSrcVal is used in msg's payload and thus needs to be GRF-aligned
12368+ pFinalAtomicSrcVal = m_currShader->GetNewVariable(1, ISA_TYPE_D, EALIGN_GRF, true, CName::NONE);
12369+ if (pSrc->IsImmediate() && pSrc->GetImmediateValue() == 1)
1234412370 {
12345- pSrcCopy = pSrcsArr[1];
12371+ if (negateSrc)
12372+ {
12373+ m_encoder->SetSrcModifier(0, EMOD_NEG);
12374+ }
12375+ m_encoder->Cast(pFinalAtomicSrcVal, numActiveLanes);
12376+ m_encoder->Push();
1234612377 }
12378+ else
12379+ {
12380+ m_encoder->Mul(pFinalAtomicSrcVal, pSrc, numActiveLanes);
12381+ m_encoder->Push();
1234712382
12348- m_encoder->SetSrcRegion(0, 0, 1, 0);
12349- m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
12350- m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
12351- m_encoder->Push();
12383+ // using neg srcmod with mul will end up with more insts, thus using srcmod on mov
12384+ if (negateSrc)
12385+ {
12386+ m_encoder->SetSrcModifier(0, EMOD_NEG);
12387+ }
12388+ m_encoder->Copy(pFinalAtomicSrcVal, pFinalAtomicSrcVal);
12389+ m_encoder->Push();
12390+ }
1235212391 }
1235312392 else
1235412393 {
12355- emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
12394+ // general case
12395+ pFinalAtomicSrcVal = m_currShader->GetNewVariable(
12396+ 1,
12397+ type,
12398+ isA64 ? EALIGN_2GRF : EALIGN_GRF,
12399+ true,
12400+ CName::NONE);
12401+
12402+ if (returnsImmValue)
12403+ {
12404+ // sum all the lanes
12405+ emitPreOrPostFixOp(op, identityValue, type, negateSrc, pSrc, pSrcsArr);
12406+
12407+ CVariable* pSrcCopy = pSrcsArr[0];
12408+ if (m_currShader->m_numberInstance == 2)
12409+ {
12410+ pSrcCopy = pSrcsArr[1];
12411+ }
12412+
12413+ m_encoder->SetSrcRegion(0, 0, 1, 0);
12414+ m_encoder->SetSrcSubReg(0, numLanes(m_currShader->m_SIMDSize) - 1);
12415+ m_encoder->Copy(pFinalAtomicSrcVal, pSrcCopy);
12416+ m_encoder->Push();
12417+ }
12418+ else
12419+ {
12420+ emitReductionAll(op, identityValue, type, negateSrc, pSrc, pFinalAtomicSrcVal);
12421+ }
1235612422 }
1235712423
1235812424 auto moveToReg = [&](CVariable*& pVar)
@@ -12388,11 +12454,6 @@ void EmitPass::emitScalarAtomics(
1238812454 m_encoder->SetSimdSize(SIMDMode::SIMD1);
1238912455 m_encoder->SetNoMask();
1239012456
12391- CVariable* pReturnVal = returnsImmValue ?
12392- m_currShader->GetNewVariable(
12393- 1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
12394- nullptr;
12395-
1239612457 if (bitWidth == 16)
1239712458 {
1239812459 CVariable* pCastAtomicSrcVal =
@@ -12402,6 +12463,11 @@ void EmitPass::emitScalarAtomics(
1240212463 pFinalAtomicSrcVal = pCastAtomicSrcVal;
1240312464 }
1240412465
12466+ CVariable* pReturnVal = returnsImmValue ?
12467+ m_currShader->GetNewVariable(
12468+ 1, ISA_TYPE_UD, EALIGN_GRF, true, CName::NONE) :
12469+ nullptr;
12470+
1240512471 if (shouldGenerateLSC(pInst))
1240612472 {
1240712473 m_encoder->LSC_AtomicRaw(
0 commit comments