Skip to content

Commit 5a62655

Browse files
jaladreipsigcbot
authored and committed
Improve (and fix) cross block load vectorization path
Cross-block load vectorization works on an assumption: within a single block, we can preload the rstack data for multiple rayinfo calls without drastically increasing overall register pressure. This lets us cull a lot of sends (applications will usually cluster rayinfo calls within a single block). The first implementation was flawed, though. It didn't take the following into account: (1) Some instructions write to the stack (like TraceRayInline), which makes the shadow copy stale. (2) RayInfo instructions create their own blocks when lowered, which affects the basic block -> stack pointer mapping, creating more shadow copies and unnecessary loads. Issue (1) is fixed by splitting the block after instructions that write to the stack. Issue (2) is fixed by collecting the ray info instructions first, assigning stack pointers to them, and only then lowering them.
1 parent e6a8ee6 commit 5a62655

File tree

2 files changed

+72
-34
lines changed

2 files changed

+72
-34
lines changed

IGC/AdaptorCommon/RayTracing/NewTraceRayInlineLoweringPass.cpp

Lines changed: 66 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,7 @@ void InlineRaytracing::EmitPreTraceRayFence(RTBuilder &IRB, Value *rqObject) {
324324

325325
void InlineRaytracing::LowerIntrinsics(Function &F) {
326326
SmallVector<RayQueryIntrinsicBase *> RQInstructions;
327+
SmallVector<RayQueryInfoIntrinsic *> RQInfoInstructions;
327328

328329
for (auto &I : instructions(F)) {
329330
if (isa<RayQueryIntrinsicBase>(&I))
@@ -366,6 +367,11 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
366367
data.CommittedDataLocation = IRB.getInt32(CommittedHit);
367368

368369
setPackedData(IRB, rqObject, data);
370+
371+
// for the cross-block optimization purposes, split basic block to avoid using stale shadow stack
372+
if (allowCrossBlockLoadVectorization())
373+
IRB.createTriangleFlow(IRB.getFalse(), RQI);
374+
369375
break;
370376
}
371377
case GenISAIntrinsic::GenISA_TraceRaySyncProceedHL: {
@@ -497,6 +503,10 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
497503
setPackedData(IRB, rqObject, data);
498504

499505

506+
// for the cross-block optimization purposes, split basic block to avoid using stale shadow stack
507+
if (allowCrossBlockLoadVectorization())
508+
IRB.createTriangleFlow(IRB.getFalse(), RQI);
509+
500510
RQI->replaceAllUsesWith(result);
501511
break;
502512
}
@@ -521,39 +531,9 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
521531
case GenISAIntrinsic::GenISA_TraceRayInlineCandidateType:
522532
RQI->replaceAllUsesWith(getPackedData(IRB, rqObject).CandidateType);
523533
break;
524-
case GenISAIntrinsic::GenISA_TraceRayInlineRayInfo: {
525-
526-
auto *I = cast<RayQueryInfoIntrinsic>(RQI);
527-
auto data = getPackedData(IRB, rqObject);
528-
auto *loadCommittedFromPotential = IRB.CreateICmpEQ(data.CommittedDataLocation, IRB.getInt32(PotentialHit),
529-
VALUE_NAME("loadCommittedInfoFromPotentialHit"));
530-
531-
auto *shaderTy = IRB.CreateSelect(loadCommittedFromPotential, IRB.getInt32(AnyHit),
532-
IRB.getInt32(I->isCommitted() ? ClosestHit : AnyHit));
533-
534-
switch (I->getInfoKind()) {
535-
default:
536-
I->replaceAllUsesWith(IRB.lowerRayInfo(getStackPtr(IRB, rqObject, true), I, shaderTy, std::nullopt));
537-
break;
538-
// leave this in for now, until we prove we don't need the hack anymore
539-
case GEOMETRY_INDEX: {
540-
bool specialPattern = false;
541-
if (I->isCommitted() && IGC_GET_FLAG_VALUE(ForceRTShortCircuitingOR)) {
542-
specialPattern = forceShortCurcuitingOR_CommittedGeomIdx(IRB, I);
543-
}
544-
545-
Value *leafType = IRB.getLeafType(getStackPtr(IRB, rqObject, true), IRB.getInt1(I->isCommitted()));
546-
Value *geoIndex = IRB.getGeometryIndex(
547-
getStackPtr(IRB, rqObject, true), I, leafType,
548-
IRB.getInt32(I->isCommitted() ? CallableShaderTypeMD::ClosestHit : CallableShaderTypeMD::AnyHit),
549-
!specialPattern);
550-
IGC_ASSERT_MESSAGE(I->getType()->isIntegerTy(), "Invalid geometryIndex type!");
551-
I->replaceAllUsesWith(geoIndex);
552-
break;
553-
}
554-
}
534+
case GenISAIntrinsic::GenISA_TraceRayInlineRayInfo:
535+
RQInfoInstructions.push_back(cast<RayQueryInfoIntrinsic>(RQI));
555536
break;
556-
}
557537
case GenISAIntrinsic::GenISA_TraceRayInlineCommitNonOpaqueTriangleHit: {
558538
auto data = getPackedData(IRB, rqObject);
559539
auto *notDone = IRB.CreateAnd({IRB.CreateICmpEQ(data.HasAcceptHitAndEndSearchFlag, IRB.getInt32(0)),
@@ -584,6 +564,11 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
584564
data.CommittedStatus = IRB.getInt32(RTStackFormat::COMMITTED_STATUS::COMMITTED_PROCEDURAL_PRIMITIVE_HIT);
585565

586566
setPackedData(IRB, rqObject, data);
567+
568+
// for the cross-block optimization purposes, split basic block to avoid using stale shadow stack
569+
if (allowCrossBlockLoadVectorization())
570+
IRB.createTriangleFlow(IRB.getFalse(), RQI);
571+
587572
break;
588573
}
589574
default:
@@ -592,6 +577,55 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
592577
}
593578
}
594579

580+
// first map every rayinfo instruction to a stack pointer
581+
// we do it this way because rayinfo lowering itself will produce blocks
582+
// so a 2-pass method will yield better results
583+
MapVector<RayQueryInfoIntrinsic *, RTBuilder::SyncStackPointerVal *> RQInfoStackMap;
584+
585+
for (auto *I : RQInfoInstructions) {
586+
587+
auto *convertRQHandleFromRQObject = cast<Instruction>(I->getQueryObjIndex());
588+
auto *rqObject = convertRQHandleFromRQObject->getOperand(0);
589+
IRB.SetInsertPoint(I);
590+
RQInfoStackMap.insert(std::make_pair(I, getStackPtr(IRB, rqObject, true)));
591+
}
592+
593+
// now we can actually lower rayinfo instructions
594+
for (const auto& [I, stackPtr] : RQInfoStackMap) {
595+
596+
IRB.SetInsertPoint(I);
597+
auto *convertRQHandleFromRQObject = cast<Instruction>(I->getQueryObjIndex());
598+
auto *rqObject = convertRQHandleFromRQObject->getOperand(0);
599+
auto data = getPackedData(IRB, rqObject);
600+
auto *loadCommittedFromPotential = IRB.CreateICmpEQ(data.CommittedDataLocation, IRB.getInt32(PotentialHit),
601+
VALUE_NAME("loadCommittedInfoFromPotentialHit"));
602+
603+
auto *shaderTy = IRB.CreateSelect(loadCommittedFromPotential, IRB.getInt32(AnyHit),
604+
IRB.getInt32(I->isCommitted() ? ClosestHit : AnyHit));
605+
606+
switch (I->getInfoKind()) {
607+
default:
608+
I->replaceAllUsesWith(IRB.lowerRayInfo(stackPtr, I, shaderTy, std::nullopt));
609+
break;
610+
// leave this in for now, until we prove we don't need the hack anymore
611+
case GEOMETRY_INDEX: {
612+
bool specialPattern = false;
613+
if (I->isCommitted() && IGC_GET_FLAG_VALUE(ForceRTShortCircuitingOR)) {
614+
specialPattern = forceShortCurcuitingOR_CommittedGeomIdx(IRB, I);
615+
}
616+
617+
Value *leafType = IRB.getLeafType(stackPtr, IRB.getInt1(I->isCommitted()));
618+
Value *geoIndex = IRB.getGeometryIndex(
619+
stackPtr, I, leafType,
620+
IRB.getInt32(I->isCommitted() ? CallableShaderTypeMD::ClosestHit : CallableShaderTypeMD::AnyHit),
621+
!specialPattern);
622+
IGC_ASSERT_MESSAGE(I->getType()->isIntegerTy(), "Invalid geometryIndex type!");
623+
I->replaceAllUsesWith(geoIndex);
624+
break;
625+
}
626+
}
627+
}
628+
595629
llvm::for_each(RQInstructions, [](RayQueryIntrinsicBase *I) {
596630
auto *RQHandle = cast<Instruction>(I->getQueryObjIndex());
597631
I->eraseFromParent();

IGC/AdaptorCommon/RayTracing/NewTraceRayInlineLoweringPass.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,15 @@ class InlineRaytracing : public AllocationLivenessAnalyzer {
143143
IRB.CreateStore(packedData, getAtIndexFromRayQueryObject(IRB, rqObject, 1));
144144
}
145145

146+
bool allowCrossBlockLoadVectorization() {
147+
148+
return IGC_IS_FLAG_ENABLED(UseCrossBlockLoadVectorizationForInlineRaytracing) && m_pCGCtx->m_retryManager.IsFirstTry();
149+
}
150+
146151
llvm::RTBuilder::SyncStackPointerVal *getStackPtr(llvm::RTBuilder &IRB, llvm::Value *rqObject,
147152
bool allowXBlockVectorize = false) {
148153

149-
bool doXBlockVectorize =
150-
allowXBlockVectorize && IGC_IS_FLAG_ENABLED(UseCrossBlockLoadVectorizationForInlineRaytracing);
154+
bool doXBlockVectorize = allowCrossBlockLoadVectorization() && allowXBlockVectorize;
151155

152156
// scan the basic block for continuation intrinsics. we don't want to contribute to raytracing swstack
153157
if (doXBlockVectorize) {

0 commit comments

Comments
 (0)