@@ -14188,10 +14188,19 @@ void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, C
1418814188        for( unsigned int i = 0; i < numIterations; i++ )
1418914189        {
1419014190            // Get alias for src0, src1, and dst based on offsets and SIMD size
14191-             auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14192-             auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14193-             auto* layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
14194- 
14191+             auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14192+             auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14193+             CVariable* layerDst;
14194+             if( (srcElementCount >> 1 <= dst->GetNumberElement()) && (i + 1 == numIterations ))
14195+             {
14196+                 // Final layer, use destination of WaveAll vector intrinsic inst (passed in with correct offset)
14197+                 layerDst = dst;
14198+             }
14199+             else
14200+             {
14201+                 // Use src as workspace to store intermediate values
14202+                 layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes, false );
14203+             }
1419514204            if( !int64EmulationNeeded )
1419614205            {
1419714206                m_encoder->SetNoMask();
@@ -14220,13 +14229,6 @@ void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, C
1422014229        srcElementCount >>= 1;
1422114230        reductionElementCount >>= 1;
1422214231    }
14223- 
14224-     // copy fully reduced elements from src to dst
14225-     auto* finalLayerDst = m_currShader->GetNewAlias( src, type, 0, dst->GetNumberElement() );
14226-     m_encoder->SetNoMask();
14227-     m_encoder->SetSimdSize( lanesToSIMDMode( dst->GetNumberElement() ) );
14228-     m_encoder->Copy( dst, finalLayerDst );
14229-     m_encoder->Push();
1423014232}
1423114233
1423214234// Recursive function that emits one or more joint reduction trees based on the joint output width
@@ -14240,8 +14242,8 @@ void EmitPass::emitReductionTrees( e_opcode op, VISA_Type type, SIMDMode simdMod
1424014242        // Do full tree reduction
1424114243        unsigned int reductionElements = src->GetNumberElement() / dst->GetNumberElement();
1424214244        unsigned int groupReductionElementCount = reductionElements * simdLanes;
14243-         CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount );
14244-         CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes);
14245+         CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount, false  );
14246+         CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes, false );
1424514247        emitReductionTree( op, type, srcAlias, dstAlias );
1424614248        // Start new recursive tree if any elements are left
1424714249        if ( numGroups > simdLanes )
@@ -22559,13 +22561,13 @@ void EmitPass::emitWaveAll(llvm::GenIntrinsicInst* inst)
2255922561            for( uint16_t i = 0; i < dst->GetNumberElement(); i++ )
2256022562            {
2256122563                // Prepare reduceSrc
22562-                 CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize )  );
22563-                 CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22564+                 CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
22565+                 CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false  );
2256422566                ScanReducePrepareSrc( type, identity, false, false, srcAlias, reduceSrcAlias );
2256522567
2256622568                // Prepare reduceSrcSecondHalf
22567-                 CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22568-                 CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize )  );
22569+                 CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false  );
22570+                 CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ), false );
2256922571                ScanReducePrepareSrc( type, identity, false, true, srcSecondHalfAlias, reduceSrcSecondHalfAlias );
2257022572
2257122573                // Emit correct operations
0 commit comments