@@ -13565,7 +13565,6 @@ CVariable* EmitPass::ReductionReduceHelper(e_opcode op, VISA_Type type, SIMDMode
 // Reduction all expand helper: dst_lane{0..(simd-1)} = src_lane{0} OP src_lane{1}
 void EmitPass::ReductionExpandHelper(e_opcode op, VISA_Type type, CVariable* src, CVariable* dst)
 {
-    const bool is64bitType = ScanReduceIs64BitType(type);
     const bool isInt64Mul = ScanReduceIsInt64Mul(op, type);
     const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded(op, type);
 
@@ -13878,6 +13877,110 @@ void EmitPass::ReductionClusteredExpandHelper(e_opcode op, VISA_Type type, SIMDM
     }
 }
 
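+// Emits a single reduction tree: src holds dst->GetNumberElement() independent groups of contiguous
+// elements, and each layer halves the number of elements per group with NoMask ops until one element
+// per group remains; the results are then copied to dst.
+// Illustrative trace (editorial sketch, not exhaustive): for 4 groups of 16 elements on a SIMD16
+// dispatch, the layers execute as 2 SIMD16 ops, 1 SIMD16 op, 1 SIMD8 op, and 1 SIMD4 op, leaving 4
+// reduced values that are copied to dst.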
+void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, CVariable* dst )
+{
+    const bool isInt64Mul = ScanReduceIsInt64Mul( op, type );
+    const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded( op, type );
+
+    uint16_t srcElementCount = src->GetNumberElement(); // total elements in reduction tree
+    uint16_t reductionElementCount = srcElementCount / dst->GetNumberElement(); // number of elements participating per reduction
+    // Build reduction tree layers
+    while( srcElementCount > dst->GetNumberElement() )
+    {
+        // Each layer operation merges multiple separate reduction intermediary steps
+        // Calculate max lanes per operation and number of merged reduction operations for current layer
+        SIMDMode maxSimdMode = ( m_currShader->m_dispatchSize == SIMDMode::SIMD32 && m_currShader->m_numberInstance > 1 ) ? SIMDMode::SIMD16 : m_currShader->m_dispatchSize;
+        SIMDMode layerMaxSimdMode = lanesToSIMDMode( min( numLanes( maxSimdMode ), (uint16_t)( srcElementCount >> 1 ) ) );
+        uint16_t layerMaxSimdLanes = numLanes( layerMaxSimdMode );
+        uint16_t src1Offset = reductionElementCount >> 1;
+        unsigned int numIterations = srcElementCount / ( 2 * layerMaxSimdLanes ); // number of reduction operations for current layer
+        for( unsigned int i = 0; i < numIterations; i++ )
+        {
+            // Get alias for src0, src1, and dst based on offsets and SIMD size
+            auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
+            auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
+            auto* layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
+
+            if( !int64EmulationNeeded )
+            {
+                m_encoder->SetNoMask();
+                m_encoder->SetSimdSize( layerMaxSimdMode );
+                // Set up correct vertical stride and width
+                m_encoder->SetSrcRegion( 0, reductionElementCount, ( reductionElementCount >> 1 ), 1 );
+                m_encoder->SetSrcRegion( 1, reductionElementCount, ( reductionElementCount >> 1 ), 1 );
+                m_encoder->GenericAlu( op, layerDst, layerSrc0, layerSrc1 );
+                m_encoder->Push();
+            }
+            else
+            {
+                if( isInt64Mul )
+                {
+                    CVariable* tempMulSrc[ 2 ] = { layerSrc0, layerSrc1 };
+                    Mul64( layerDst, tempMulSrc, layerMaxSimdMode, true /*noMask*/ );
+                }
+                else
+                {
+                    IGC_ASSERT_MESSAGE( 0, "Unsupported" );
+                }
+            }
+        }
+
+        // Layer complete: the total number of elements and the number of elements participating per reduction are both halved
+        srcElementCount >>= 1;
+        reductionElementCount >>= 1;
+    }
+
+    // copy fully reduced elements from src to dst
+    auto* finalLayerDst = m_currShader->GetNewAlias( src, type, 0, dst->GetNumberElement() );
+    m_encoder->SetNoMask();
+    m_encoder->SetSimdSize( lanesToSIMDMode( dst->GetNumberElement() ) );
+    m_encoder->Copy( dst, finalLayerDst );
+    m_encoder->Push();
+}
+
+// Recursive function that emits one or more joint reduction trees based on the joint output width
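+// Illustrative sketch (editorial, derived from the code below): with a SIMD16 dispatch and 6 output
+// elements, 6 < 16 so the width is lowered until SIMD4 fits; groups 0-3 are reduced with one SIMD4
+// tree, and the recursion then handles groups 4-5 with a SIMD2 tree.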
+void EmitPass::emitReductionTrees( e_opcode op, VISA_Type type, SIMDMode simdMode, CVariable* src, CVariable* dst, unsigned int startIdx, unsigned int endIdx )
+{
+    unsigned int numGroups = endIdx - startIdx + 1;
+    // lanes for final joint reduction
+    uint16_t simdLanes = numLanes( simdMode );
+    if( numGroups >= simdLanes )
+    {
+        // Do full tree reduction
+        unsigned int reductionElements = src->GetNumberElement() / dst->GetNumberElement();
+        unsigned int groupReductionElementCount = reductionElements * simdLanes;
+        CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount );
+        CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes );
+        emitReductionTree( op, type, srcAlias, dstAlias );
+        // Start a new recursive tree if any elements are left
+        if( numGroups > simdLanes )
+        {
+            emitReductionTrees( op, type, simdMode, src, dst, startIdx + simdLanes, endIdx );
+        }
+    }
+    else
+    {
+        // Overshoot: try a lower SIMD width for the final reduction op.
+        // TODO: Instead of trying a lower SIMD width, we could generate a simdLanes-wide final join instruction and pass in identity/0/don't-care values for the unused joins.
+        // However, this would require a change to WaveAllJointReduction to generate intrinsic calls with a fixed vector width, to ensure the vector source variable is generated with the proper bounds,
+        // or logic to copy the vector source variable to a simdLanes * simdLanes sized variable along with logic to generate only the necessary operations on that variable.
+        switch( simdMode )
+        {
+        case SIMDMode::SIMD32:
+            return emitReductionTrees( op, type, SIMDMode::SIMD16, src, dst, startIdx, endIdx );
+        case SIMDMode::SIMD16:
+            return emitReductionTrees( op, type, SIMDMode::SIMD8, src, dst, startIdx, endIdx );
+        case SIMDMode::SIMD8:
+            return emitReductionTrees( op, type, SIMDMode::SIMD4, src, dst, startIdx, endIdx );
+        case SIMDMode::SIMD4:
+            return emitReductionTrees( op, type, SIMDMode::SIMD2, src, dst, startIdx, endIdx );
+        case SIMDMode::SIMD2:
+        default:
+            return emitReductionTrees( op, type, SIMDMode::SIMD1, src, dst, startIdx, endIdx );
+        }
+    }
+}
+
 // do reduction and accumulate all the activate channels, return a uniform
 void EmitPass::emitReductionAll(
     e_opcode op, uint64_t identityValue, VISA_Type type, bool negate, CVariable* src, CVariable* dst)
@@ -13893,8 +13996,6 @@ void EmitPass::emitReductionAll(
     }
     else
     {
-        const SIMDMode simd = SIMDMode::SIMD16;
-
         CVariable* srcH2 = ScanReducePrepareSrc(type, identityValue, negate, true /* secondHalf */,
             src, nullptr /* dst */);
 
@@ -21891,15 +21992,86 @@ void EmitPass::emitWaveAll(llvm::GenIntrinsicInst* inst)
     {
         ForceDMask();
     }
+    m_encoder->SetSubSpanDestination( false );
     CVariable* src = GetSymbol(inst->getOperand(0));
+    CVariable* dst = m_destination;
     const WaveOps op = static_cast<WaveOps>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
     VISA_Type type;
     e_opcode opCode;
     uint64_t identity = 0;
-    GetReductionOp(op, inst->getOperand(0)->getType(), identity, opCode, type);
-    CVariable* dst = m_destination;
-    m_encoder->SetSubSpanDestination(false);
-    emitReductionAll(opCode, identity, type, false, src, dst);
+    if( inst->getOperand( 0 )->getType()->isVectorTy() )
+    {
+        // Joint reduction optimization: multiple consecutive independent wave ops were merged, so a wider reduction tree can be constructed
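+        // Each vector element of the operand is an independent reduction source (as set up by the
+        // WaveAllJointReduction pass); element i of the destination presumably receives the reduced
+        // value of source group i.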
+        GetReductionOp( op, cast<VectorType>( inst->getOperand( 0 )->getType() )->getElementType(), identity, opCode, type );
+
+        if( m_currShader->m_dispatchSize == SIMDMode::SIMD32 && m_currShader->m_numberInstance > 1 )
+        {
+            // Dual SIMD16 mode, use 1 SIMD16 inst per reduction for first layer to reduce 32 elements down to 16
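+            // Assumption based on the secondHalf flag of ScanReducePrepareSrc below: reduceSrc holds the
+            // first-instance lanes and reduceSrcSecondHalf the second-instance lanes, so one SIMD16 op per
+            // element combines both instances.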
+            CVariable* reduceSrc = m_currShader->GetNewVariable( src->GetNumberElement(), type, src->GetAlign(), CName( CName( "reduceSrc_" ), src->getName().getCString() ) );
+            CVariable* reduceSrcSecondHalf = m_currShader->GetNewVariable( src->GetNumberElement(), type, src->GetAlign(), CName( CName( "reduceSrcSecondHalf_" ), src->getName().getCString() ) );
+
+            const bool isInt64Mul = ScanReduceIsInt64Mul( opCode, type );
+            const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded( opCode, type );
+
+            // Explicitly generate the first layer (technically layer 0, since no operations are joint yet; we are still operating within a single reduction op)
+            for( uint16_t i = 0; i < dst->GetNumberElement(); i++ )
+            {
+                // Prepare reduceSrc
+                CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
+                CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
+                ScanReducePrepareSrc( type, identity, false, false, srcAlias, reduceSrcAlias );
+
+                // Prepare reduceSrcSecondHalf
+                CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
+                CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
+                ScanReducePrepareSrc( type, identity, false, true, srcSecondHalfAlias, reduceSrcSecondHalfAlias );
+
+                // Emit correct operations
+                if( !int64EmulationNeeded )
+                {
+                    m_encoder->SetNoMask();
+                    m_encoder->SetSimdSize( SIMDMode::SIMD16 );
+                    m_encoder->GenericAlu( opCode, reduceSrcAlias, reduceSrcAlias, reduceSrcSecondHalfAlias );
+                    m_encoder->Push();
+                }
+                else
+                {
+                    if( isInt64Mul )
+                    {
+                        CVariable* tmpMulSrc[ 2 ] = { reduceSrcAlias, reduceSrcSecondHalfAlias };
+                        Mul64( reduceSrcAlias, tmpMulSrc, SIMDMode::SIMD16, true /*noMask*/ );
+                    }
+                    else
+                    {
+                        IGC_ASSERT_MESSAGE( 0, "Unsupported" );
+                    }
+                }
+            }
+
+            // Now that the 32 elements per reduction have been reduced to 16 in layer 0, proceed with the regular reduction tree implementation using SIMD16
+            emitReductionTrees( opCode, type, SIMDMode::SIMD16, reduceSrc, dst, 0, dst->GetNumberElement() - 1 );
+        }
+        else
+        {
+            CVariable* reduceSrc = m_currShader->GetNewVariable( src->GetNumberElement(), type, src->GetAlign(), CName( CName( "reduceSrc_" ), src->getName().getCString() ) );
+            // Prepare reduceSrc for all elements
+            for( int i = 0; i < dst->GetNumberElement(); i++ )
+            {
+                CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
+                CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
+                ScanReducePrepareSrc( type, identity, false, false, srcAlias, reduceSrcAlias );
+            }
+
+            emitReductionTrees( opCode, type, m_currShader->m_dispatchSize, reduceSrc, dst, 0, dst->GetNumberElement() - 1 );
+        }
+    }
+    else
+    {
+        // Single WaveAll, emit base reduction tree
+        GetReductionOp( op, inst->getOperand( 0 )->getType(), identity, opCode, type );
+        emitReductionAll( opCode, identity, type, false, src, dst );
+    }
+
     if (disableHelperLanes)
     {
         ResetVMask();