Skip to content

Commit be108bd

Browse files
bowenxue-intel authored and igcbot committed
WaveAllJointReduction Optimization
Merge multiple consecutive WaveAll operations into a joint reduction tree
1 parent 1037a76 commit be108bd

File tree

13 files changed

+888
-7
lines changed

13 files changed

+888
-7
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 179 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13565,7 +13565,6 @@ CVariable* EmitPass::ReductionReduceHelper(e_opcode op, VISA_Type type, SIMDMode
1356513565
// Reduction all expand helper: dst_lane{0..(simd-1)} = src_lane{0} OP src_lane{1}
1356613566
void EmitPass::ReductionExpandHelper(e_opcode op, VISA_Type type, CVariable* src, CVariable* dst)
1356713567
{
13568-
const bool is64bitType = ScanReduceIs64BitType(type);
1356913568
const bool isInt64Mul = ScanReduceIsInt64Mul(op, type);
1357013569
const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded(op, type);
1357113570

@@ -13878,6 +13877,110 @@ void EmitPass::ReductionClusteredExpandHelper(e_opcode op, VISA_Type type, SIMDM
1387813877
}
1387913878
}
1388013879

13880+
// Emits one joint reduction tree: pairs of partial results are combined with `op`, layer by
// layer, until only dst->GetNumberElement() values remain; those values are then copied to dst.
//   op   - ALU opcode applied at every node of the tree
//   type - element type used for every alias created over src
//   src  - inputs of all joined reductions; each reduction contributes
//          src->GetNumberElement() / dst->GetNumberElement() elements. Layer results are
//          written through aliases of src, so src is overwritten while the tree is built.
//   dst  - receives one fully reduced value per joined reduction
// When the op/type pair needs int64 emulation, only the int64 multiply case is handled (via
// Mul64); any other emulated op hits the "Unsupported" assert.
void EmitPass::emitReductionTree( e_opcode op, VISA_Type type, CVariable* src, CVariable* dst )
13881+
{
13882+
const bool isInt64Mul = ScanReduceIsInt64Mul( op, type );
13883+
const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded( op, type );
13884+
13885+
uint16_t srcElementCount = src->GetNumberElement(); // total elements still live in the reduction tree
13886+
uint16_t reductionElementCount = srcElementCount / dst->GetNumberElement(); // number of elements participating per reduction
13887+
// Build reduction tree layers; every pass of this loop halves the number of live elements
13888+
while( srcElementCount > dst->GetNumberElement() )
13889+
{
13890+
// Each layer operation merges multiple separate reduction intermediary steps
13891+
// Calculate max lanes per operation and number of merged reduction operations for current layer
13892+
// A SIMD32 dispatch that runs as more than one instance is capped at SIMD16 per instruction
SIMDMode maxSimdMode = ( m_currShader->m_dispatchSize == SIMDMode::SIMD32 && m_currShader->m_numberInstance > 1 ) ? SIMDMode::SIMD16 : m_currShader->m_dispatchSize;
13893+
// A layer produces srcElementCount / 2 results, so it never needs more lanes than that
SIMDMode layerMaxSimdMode = lanesToSIMDMode( min( numLanes( maxSimdMode ), (uint16_t)( srcElementCount >> 1 ) ) );
13894+
uint16_t layerMaxSimdLanes = numLanes( layerMaxSimdMode );
13895+
// src1 starts half a reduction group after src0
uint16_t src1Offset = reductionElementCount >> 1;
13896+
unsigned int numIterations = srcElementCount / ( 2 * layerMaxSimdLanes ); // number of reduction operations for current layer
13897+
for( unsigned int i = 0; i < numIterations; i++ )
13898+
{
13899+
// Get alias for src0, src1, and dst based on offsets and SIMD size
13900+
// Offsets are element indices scaled by the element size of `type`.
// Each iteration consumes 2 * layerMaxSimdLanes source elements and writes layerMaxSimdLanes
// results; layerDst is an alias of src, so results are packed back at the front of src.
auto* layerSrc0 = m_currShader->GetNewAlias( src, type, i * 2 * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
13901+
auto* layerSrc1 = m_currShader->GetNewAlias( src, type, ( i * 2 * layerMaxSimdLanes + src1Offset ) * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
13902+
auto* layerDst = m_currShader->GetNewAlias( src, type, i * layerMaxSimdLanes * m_encoder->GetCISADataTypeSize( type ), layerMaxSimdLanes );
13903+
13904+
if( !int64EmulationNeeded )
13905+
{
13906+
m_encoder->SetNoMask();
13907+
m_encoder->SetSimdSize( layerMaxSimdMode );
13908+
// Set up correct vertical stride and width
13909+
// NOTE(review): arguments are presumably (source index, vertical stride, width, horizontal stride),
// i.e. read the first half of every reduction group - confirm against the encoder's SetSrcRegion
m_encoder->SetSrcRegion( 0, reductionElementCount, ( reductionElementCount >> 1 ), 1 );
13910+
m_encoder->SetSrcRegion( 1, reductionElementCount, ( reductionElementCount >> 1 ), 1 );
13911+
m_encoder->GenericAlu( op, layerDst, layerSrc0, layerSrc1 );
13912+
m_encoder->Push();
13913+
}
13914+
else
13915+
{
13916+
if( isInt64Mul )
13917+
{
13918+
// Emulated 64-bit multiply; no source regions are set on this path
CVariable* tempMulSrc[ 2 ] = { layerSrc0, layerSrc1 };
13919+
Mul64( layerDst, tempMulSrc, layerMaxSimdMode, true /*noMask*/ );
13920+
}
13921+
else
13922+
{
13923+
IGC_ASSERT_MESSAGE( 0, "Unsupported" );
13924+
}
13925+
}
13926+
}
13927+
13928+
// Layer complete, total number of elements and number of elements participating per reduction halved
13929+
srcElementCount >>= 1;
13930+
reductionElementCount >>= 1;
13931+
}
13932+
13933+
// copy fully reduced elements from the front of src to dst
13934+
auto* finalLayerDst = m_currShader->GetNewAlias( src, type, 0, dst->GetNumberElement() );
13935+
m_encoder->SetNoMask();
13936+
m_encoder->SetSimdSize( lanesToSIMDMode( dst->GetNumberElement() ) );
13937+
m_encoder->Copy( dst, finalLayerDst );
13938+
m_encoder->Push();
13939+
}
13940+
13941+
// Recursive function that emits one or more joint reduction trees based on the joint output width.
// The reductions with indices startIdx..endIdx (inclusive) are processed as follows:
//   - if at least numLanes( simdMode ) reductions remain, that many are reduced together by one
//     emitReductionTree call and the function recurses on whatever is left;
//   - otherwise the function retries the same range with the next lower SIMD mode.
//   op, type   - forwarded unchanged to emitReductionTree
//   simdMode   - widest final joint reduction width to try for this range
//   src, dst   - full source and destination variables; the per-tree aliases are carved out here
// NOTE(review): with an empty range (endIdx + 1 == startIdx) numGroups is 0, every mode including
// SIMD1 takes the else branch, and the recursion does not terminate - callers are assumed to pass
// a non-empty range.
13942+
void EmitPass::emitReductionTrees( e_opcode op, VISA_Type type, SIMDMode simdMode, CVariable* src, CVariable* dst, unsigned int startIdx, unsigned int endIdx )
13943+
{
13944+
// number of reductions still to be emitted (range is inclusive on both ends)
unsigned int numGroups = endIdx - startIdx + 1;
13945+
// lanes for final joint reduction
13946+
uint16_t simdLanes = numLanes( simdMode );
13947+
if( numGroups >= simdLanes )
13948+
{
13949+
// Do full tree reduction
13950+
// elements contributed by each individual reduction, computed from the full src and dst
unsigned int reductionElements = src->GetNumberElement() / dst->GetNumberElement();
13951+
// elements consumed by one joint tree that covers simdLanes reductions
unsigned int groupReductionElementCount = reductionElements * simdLanes;
13952+
// alias offsets are element indices scaled by the element size of `type`
CVariable* srcAlias = m_currShader->GetNewAlias( src, type, startIdx * reductionElements * m_encoder->GetCISADataTypeSize( type ), groupReductionElementCount );
13953+
CVariable* dstAlias = m_currShader->GetNewAlias( dst, type, startIdx * m_encoder->GetCISADataTypeSize( type ), simdLanes);
13954+
emitReductionTree( op, type, srcAlias, dstAlias );
13955+
// Start new recursive tree if any elements are left
13956+
if ( numGroups > simdLanes )
13957+
{
13958+
emitReductionTrees( op, type, simdMode, src, dst, startIdx + simdLanes, endIdx );
13959+
}
13960+
}
13961+
else
13962+
{
13963+
// Overshoot, try lower SIMD for the final reduction op
13964+
// TODO: Instead of trying lower SIMD, could generate simdLanes wide final join instruction, and pass in identity/0/don't care values for unused joins
13965+
// However, this will require a change to WaveAllJointReduction to generate intrinsic calls with fixed vector width to ensure the vector source variable used is generated with the proper bounds
13966+
// or logic to copy the vector source variable to a simdLane * simdLane sized variable along with logic to generate only the necessary operation on that variable
13967+
switch( simdMode )
13968+
{
13969+
case SIMDMode::SIMD32:
13970+
return emitReductionTrees( op, type, SIMDMode::SIMD16, src, dst, startIdx, endIdx );
13971+
case SIMDMode::SIMD16:
13972+
return emitReductionTrees( op, type, SIMDMode::SIMD8, src, dst, startIdx, endIdx );
13973+
case SIMDMode::SIMD8:
13974+
return emitReductionTrees( op, type, SIMDMode::SIMD4, src, dst, startIdx, endIdx );
13975+
case SIMDMode::SIMD4:
13976+
return emitReductionTrees( op, type, SIMDMode::SIMD2, src, dst, startIdx, endIdx );
13977+
case SIMDMode::SIMD2:
13978+
// any mode not listed above also falls back to SIMD1
default:
13979+
return emitReductionTrees( op, type, SIMDMode::SIMD1, src, dst, startIdx, endIdx );
13980+
}
13981+
}
13982+
}
13983+
1388113984
// do reduction and accumulate all the activate channels, return a uniform
1388213985
void EmitPass::emitReductionAll(
1388313986
e_opcode op, uint64_t identityValue, VISA_Type type, bool negate, CVariable* src, CVariable* dst)
@@ -13893,8 +13996,6 @@ void EmitPass::emitReductionAll(
1389313996
}
1389413997
else
1389513998
{
13896-
const SIMDMode simd = SIMDMode::SIMD16;
13897-
1389813999
CVariable* srcH2 = ScanReducePrepareSrc(type, identityValue, negate, true /* secondHalf */,
1389914000
src, nullptr /* dst */);
1390014001

@@ -21891,15 +21992,86 @@ void EmitPass::emitWaveAll(llvm::GenIntrinsicInst* inst)
2189121992
{
2189221993
ForceDMask();
2189321994
}
21995+
m_encoder->SetSubSpanDestination( false );
2189421996
CVariable* src = GetSymbol(inst->getOperand(0));
21997+
CVariable* dst = m_destination;
2189521998
const WaveOps op = static_cast<WaveOps>(cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue());
2189621999
VISA_Type type;
2189722000
e_opcode opCode;
2189822001
uint64_t identity = 0;
21899-
GetReductionOp(op, inst->getOperand(0)->getType(), identity, opCode, type);
21900-
CVariable* dst = m_destination;
21901-
m_encoder->SetSubSpanDestination(false);
21902-
emitReductionAll(opCode, identity, type, false, src, dst);
22002+
if( inst->getOperand( 0 )->getType()->isVectorTy() )
22003+
{
22004+
// Joint reduction optimization: the operand is a vector built from multiple consecutive independent wave ops, so a wider reduction tree can be constructed
22005+
GetReductionOp( op, cast<VectorType>( inst->getOperand( 0 )->getType() )->getElementType(), identity, opCode, type );
22006+
22007+
if( m_currShader->m_dispatchSize == SIMDMode::SIMD32 && m_currShader->m_numberInstance > 1 )
22008+
{
22009+
// Dual SIMD16 mode, use 1 SIMD16 inst per reduction for first layer to reduce 32 elements down to 16
22010+
CVariable* reduceSrc = m_currShader->GetNewVariable( src->GetNumberElement(), type, src->GetAlign(), CName( CName( "reduceSrc_" ), src->getName().getCString() ) );
22011+
CVariable* reduceSrcSecondHalf = m_currShader->GetNewVariable( src->GetNumberElement(), type, src->GetAlign(), CName( CName( "reduceSrcSecondHalf_" ), src->getName().getCString() ) );
22012+
22013+
const bool isInt64Mul = ScanReduceIsInt64Mul( opCode, type );
22014+
const bool int64EmulationNeeded = ScanReduceIsInt64EmulationNeeded( opCode, type );
22015+
22016+
// Explicitly generate First layer (Technically 0th layer since no operations are joint yet, we are still operating within a single reduction op)
22017+
for( uint16_t i = 0; i < dst->GetNumberElement(); i++ )
22018+
{
22019+
// Prepare reduceSrc
22020+
CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22021+
CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22022+
ScanReducePrepareSrc( type, identity, false, false, srcAlias, reduceSrcAlias );
22023+
22024+
// Prepare reduceSrcSecondHalf
22025+
CVariable* srcSecondHalfAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22026+
CVariable* reduceSrcSecondHalfAlias = m_currShader->GetNewAlias( reduceSrcSecondHalf, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22027+
ScanReducePrepareSrc( type, identity, false, true, srcSecondHalfAlias, reduceSrcSecondHalfAlias );
22028+
22029+
// Emit correct operations
22030+
if( !int64EmulationNeeded )
22031+
{
22032+
m_encoder->SetNoMask();
22033+
m_encoder->SetSimdSize( SIMDMode::SIMD16 );
22034+
m_encoder->GenericAlu( opCode, reduceSrcAlias, reduceSrcAlias, reduceSrcSecondHalfAlias );
22035+
m_encoder->Push();
22036+
}
22037+
else
22038+
{
22039+
if( isInt64Mul )
22040+
{
22041+
CVariable* tmpMulSrc[ 2 ] = { reduceSrcAlias, reduceSrcSecondHalfAlias };
22042+
Mul64( reduceSrcAlias, tmpMulSrc, SIMDMode::SIMD16, true );
22043+
}
22044+
else
22045+
{
22046+
IGC_ASSERT_MESSAGE( 0, "Unsupported" );
22047+
}
22048+
}
22049+
}
22050+
22051+
// Now that 32 elements per reduction have been reduced to 16 in layer 0, can proceed with regular reduction tree implementation using SIMD16
22052+
emitReductionTrees( opCode, type, SIMDMode::SIMD16, reduceSrc, dst, 0, dst->GetNumberElement() - 1 );
22053+
}
22054+
else
22055+
{
22056+
CVariable* reduceSrc = m_currShader->GetNewVariable( src->GetNumberElement(), type, src->GetAlign(), CName( CName( "reduceSrc_" ), src->getName().getCString() ) );
22057+
// Prepare reduceSrc for all elements
22058+
for( int i = 0; i < dst->GetNumberElement(); i++ )
22059+
{
22060+
CVariable* srcAlias = m_currShader->GetNewAlias( src, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22061+
CVariable* reduceSrcAlias = m_currShader->GetNewAlias( reduceSrc, type, i * numLanes( m_currShader->m_SIMDSize ) * m_encoder->GetCISADataTypeSize( type ), numLanes( m_currShader->m_SIMDSize ) );
22062+
ScanReducePrepareSrc( type, identity, false, false, srcAlias, reduceSrcAlias );
22063+
}
22064+
22065+
emitReductionTrees( opCode, type, m_currShader->m_dispatchSize, reduceSrc, dst, 0, dst->GetNumberElement() - 1 );
22066+
}
22067+
}
22068+
else
22069+
{
22070+
// Single WaveAll, emit base reduction tree
22071+
GetReductionOp( op, inst->getOperand( 0 )->getType(), identity, opCode, type );
22072+
emitReductionAll( opCode, identity, type, false, src, dst );
22073+
}
22074+
2190322075
if (disableHelperLanes)
2190422076
{
2190522077
ResetVMask();

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,19 @@ class EmitPass : public llvm::FunctionPass
320320
bool negate,
321321
CVariable* src,
322322
CVariable* dst);
323+
// Emits a single joint reduction tree: reduces the elements of src with op until
// dst->GetNumberElement() values remain and copies them to dst. src is overwritten.
void emitReductionTree(
324+
e_opcode op,
325+
VISA_Type type,
326+
CVariable* src,
327+
CVariable* dst );
328+
// Emits joint reduction trees for the reductions startIdx..endIdx (inclusive), joining
// numLanes( simdMode ) reductions per tree and retrying with a lower SIMD mode when
// fewer reductions than that remain.
void emitReductionTrees(
329+
e_opcode op,
330+
VISA_Type type,
331+
SIMDMode simdMode,
332+
CVariable* src,
333+
CVariable* dst,
334+
unsigned int startIdx,
335+
unsigned int endIdx );
323336
void emitReductionClustered(
324337
const e_opcode op,
325338
const uint64_t identityValue,

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ SPDX-License-Identifier: MIT
102102
#include "Compiler/Optimizer/BarrierControlFlowOptimization.hpp"
103103
#include "Compiler/Optimizer/RuntimeValueVectorExtractPass.h"
104104
#include "Compiler/Optimizer/WaveShuffleIndexSinking.hpp"
105+
#include "Compiler/Optimizer/WaveAllJointReduction.hpp"
105106
#include "Compiler/MetaDataApi/PurgeMetaDataUtils.hpp"
106107
#include "Compiler/HandleLoadStoreInstructions.hpp"
107108
#include "Compiler/CustomSafeOptPass.hpp"
@@ -1869,6 +1870,11 @@ void OptimizeIR(CodeGenContext* const pContext)
18691870

18701871
mpm.add(llvm::createDeadCodeEliminationPass());
18711872

1873+
if( IGC_IS_FLAG_ENABLED(EnableWaveAllJointReduction) )
1874+
{
1875+
mpm.add( createWaveAllJointReduction() );
1876+
}
1877+
18721878
if (IGC_IS_FLAG_ENABLED(EnableIntDivRemCombine)) {
18731879
// simplify rem if the quotient is available
18741880
//

IGC/Compiler/InitializePasses.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ void initializeVectorBitCastOptPass(llvm::PassRegistry&);
203203
void initializeVectorPreProcessPass(llvm::PassRegistry&);
204204
void initializeVectorProcessPass(llvm::PassRegistry&);
205205
void initializeVerificationPassPass(llvm::PassRegistry&);
206+
void initializeWaveAllJointReductionPass(llvm::PassRegistry&);
206207
void initializeWGFuncResolutionPass(llvm::PassRegistry&);
207208
void initializeWIAnalysisPass(llvm::PassRegistry&);
208209
void initializeWIFuncResolutionPass(llvm::PassRegistry&);

IGC/Compiler/Optimizer/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ set(IGC_BUILD__SRC__Optimizer
3434
"${CMAKE_CURRENT_SOURCE_DIR}/RuntimeValueVectorExtractPass.cpp"
3535
"${CMAKE_CURRENT_SOURCE_DIR}/BarrierControlFlowOptimization.cpp"
3636
"${CMAKE_CURRENT_SOURCE_DIR}/WaveShuffleIndexSinking.cpp"
37+
"${CMAKE_CURRENT_SOURCE_DIR}/WaveAllJointReduction.cpp"
3738
)
3839

3940
set(IGC_BUILD__SRC__Compiler_Optimizer
@@ -61,6 +62,7 @@ set(IGC_BUILD__HDR__Optimizer
6162
"${CMAKE_CURRENT_SOURCE_DIR}/RuntimeValueVectorExtractPass.h"
6263
"${CMAKE_CURRENT_SOURCE_DIR}/BarrierControlFlowOptimization.hpp"
6364
"${CMAKE_CURRENT_SOURCE_DIR}/WaveShuffleIndexSinking.cpp"
65+
"${CMAKE_CURRENT_SOURCE_DIR}/WaveAllJointReduction.hpp"
6466
)
6567

6668
set(IGC_BUILD__HDR__Optimizer

0 commit comments

Comments
 (0)