Skip to content

Commit 8626021

Browse files
ppogotovigcbot
authored and committed
Apply GepLSR with higher threshold to optimize i64 mul instructions.
GepLSR doesn't run when the estimated register pressure is above the threshold. However, register pressure can be reduced by subsequent passes, so this commit raises the threshold for cases where address arithmetic containing i64 mul instructions can be optimized by GepLSR.
1 parent d785e7c commit 8626021

File tree

2 files changed

+121
-13
lines changed

2 files changed

+121
-13
lines changed

IGC/Compiler/Optimizer/OpenCLPasses/GEPLoopStrengthReduction/GEPLoopStrengthReduction.cpp

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ struct Score
126126

127127
// Estimated increase in register pressure when reducing to loop's preheader.
128128
unsigned RegisterPressure;
129+
130+
// Flag to show presence of i64 mul instructions in the address calculation.
131+
bool ContainsMuli64;
129132
};
130133

131134

@@ -266,7 +269,7 @@ class Scorer
266269
void scoreReducedInstructions(ReductionCandidateGroup &Candidate);
267270
void scoreRegisterPressure(ReductionCandidateGroup &Candidate);
268271

269-
int estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP);
272+
int estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP, bool &ContainsMuli64);
270273
int estimatePointerAddition(ReductionCandidateGroup &Candidate);
271274

272275
const DataLayout &DL;
@@ -316,7 +319,7 @@ class RegisterPressureTracker
316319
private:
317320

318321
unsigned MaxAllowedPressure;
319-
unsigned ExternalPressure;
322+
unsigned FunctionExternalPressure;
320323

321324
IGCLivenessAnalysis &RPE;
322325
WIAnalysisRunner &WI;
@@ -764,15 +767,13 @@ void Scorer::scoreReducedInstructions(ReductionCandidateGroup &Candidate)
764767

765768
// Score "+ base_ptr"
766769
score += estimatePointerAddition(Candidate);
770+
bool ContainsMuli64 = false;
767771

768-
// Only need to deduce if reduction is net positive, no need to continue if already confirmed.
769-
if (score < 1)
770-
{
771-
// Score "index"
772-
score += estimateIndexInstructions(*Candidate.getLoop(), Cheapest.GEP);
773-
}
772+
// Score "index"
773+
score += estimateIndexInstructions(*Candidate.getLoop(), Cheapest.GEP, ContainsMuli64);
774774

775775
Candidate.Score.ReducesInstructions = score > 0;
776+
Candidate.Score.ContainsMuli64 = ContainsMuli64;
776777
}
777778

778779

@@ -805,7 +806,8 @@ int Scorer::estimatePointerAddition(ReductionCandidateGroup &Candidate)
805806
// Estimates how many instructions required to calculate index would be reduced to preheader.
806807
// This differs from checking SCEV expression size, which it might represent simplified index
807808
// calculation.
808-
int Scorer::estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP)
809+
// Sets ContainsMuli64 flag to show if i64 multiplication is present in the gep index calculation.
810+
int Scorer::estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP, bool &ContainsMuli64)
809811
{
810812
Instruction *Index = dyn_cast<Instruction>(*(GEP->operands().end() - 1));
811813
if (!Index)
@@ -837,6 +839,9 @@ int Scorer::estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP)
837839
instructions += 1;
838840
}
839841

842+
if (I->getOpcode() == Instruction::Mul && I->getType()->isIntegerTy(64))
843+
ContainsMuli64 = true;
844+
840845
for (auto *It = I->operands().begin(); It != I->operands().end(); ++It)
841846
{
842847
if (auto *Next = dyn_cast<Instruction>(It))
@@ -1193,7 +1198,7 @@ RegisterPressureTracker::RegisterPressureTracker(Function &F, CodeGenContext &CG
11931198
{
11941199
MaxAllowedPressure = static_cast<unsigned>(CGC.getNumGRFPerThread() * IGC_GET_FLAG_VALUE(GEPLSRThresholdRatio) / 100.0f);
11951200

1196-
ExternalPressure = FRPE.getExternalPressureForFunction(&F);
1201+
FunctionExternalPressure = FRPE.getExternalPressureForFunction(&F);
11971202
}
11981203

11991204

@@ -1218,14 +1223,31 @@ void RegisterPressureTracker::trackDeletedInstruction(Value *V)
12181223

12191224
bool RegisterPressureTracker::fitsPressureThreshold(ReductionCandidateGroup &C)
12201225
{
1221-
auto *F = C.getLoop()->getLoopPreheader()->getParent();
1226+
BasicBlock *Preheader = C.getLoop()->getLoopPreheader();
1227+
auto *F = Preheader->getParent();
12221228
uint SIMD = numLanes(RPE.bestGuessSIMDSize(F));
12231229

1224-
unsigned InitialPressure = ExternalPressure + RPE.getMaxRegCountForLoop(*C.getLoop(), SIMD, &WI);
1225-
unsigned EstimatedPressure = InitialPressure + C.getScore().RegisterPressure;
1230+
unsigned MaxLoopPressure = RPE.getMaxRegCountForLoop(*C.getLoop(), SIMD, &WI);
1231+
unsigned AdditionalPressure = C.getScore().RegisterPressure;
1232+
1233+
InsideBlockPressureMap BBListing;
1234+
RPE.collectPressureForBB(*Preheader, BBListing, SIMD, &WI);
1235+
unsigned LoopExternalPressureInBytes = BBListing[cast<Value>(Preheader->getTerminator())];
1236+
unsigned LoopExternalPressure = RPE.bytesToRegisters(LoopExternalPressureInBytes);
12261237

1238+
unsigned InitialPressure = FunctionExternalPressure + MaxLoopPressure;
1239+
unsigned EstimatedPressure = InitialPressure + AdditionalPressure;
1240+
1241+
// Try not to increase register pressure above threshold.
12271242
if (EstimatedPressure >= MaxAllowedPressure)
12281243
{
1244+
// Even if the optimization increases register pressure, apply it in case we can move mul i64 to the preheader.
1245+
// This heuristic is based on the fact that mul i64 is an expensive instruction and any potential spills are generated outside of the loop.
1246+
unsigned NewInternalLoopPressure = LoopExternalPressure - MaxLoopPressure + AdditionalPressure;
1247+
if (C.getScore().ContainsMuli64 && NewInternalLoopPressure < MaxAllowedPressure) {
1248+
return true;
1249+
}
1250+
12291251
LLVM_DEBUG(
12301252
dbgs() << " Estimated register pressure " << EstimatedPressure << " above threshold " << MaxAllowedPressure << "; can't fully reduce ";
12311253
C.print(dbgs());
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: regkeys
10+
; RUN: igc_opt -debugify --igc-gep-loop-strength-reduction -check-debugify -S < %s 2>&1 | FileCheck %s
11+
12+
; Reduced index is expressed with SCEVMulExpr.
13+
14+
; Debug-info related check
15+
; CHECK: CheckModuleDebugify: PASS
16+
17+
%"class.IntVector" = type { <1024 x i64> }
18+
19+
define spir_kernel void @test(i32 addrspace(1)* %p, i32 addrspace(1)* %t, i32 %k, i32 %n, i64 %multiplier, <1024 x i64> addrspace(1)* %otp) {
20+
entry:
21+
%_alloca = alloca %"class.IntVector", align 8
22+
%vecPtr = getelementptr %"class.IntVector", %"class.IntVector"* %_alloca, i32 0, i32 0
23+
store <1024 x i64> zeroinitializer, <1024 x i64>* %vecPtr, align 8
24+
%loadedVec = load <1024 x i64>, <1024 x i64>* %vecPtr, align 8
25+
%cmp1 = icmp slt i32 0, %n
26+
br i1 %cmp1, label %for.body.lr.ph, label %for.end
27+
28+
; CHECK-LABEL: for.body.lr.ph:
29+
; Check that GepLSR was applied to the gep index where i64 multiplication was used.
30+
; CHECK: [[MULL:%.*]] = mul i64 %multiplier, 44
31+
; CHECK: [[GEP_PHI1:%.*]] = getelementptr i32, i32 addrspace(1)* %p, i64 [[MULL]]
32+
; CHECK: [[STEP:%.*]] = shl i64 %multiplier, 1
33+
34+
; Check that GepLSR was NOT applied to the gep index where NO i64 multiplication was used.
35+
; CHECK-NOT: add i32 %k, -69
36+
; CHECK-NOT: getelementptr i32, i32 addrspace(1)* %t
37+
for.body.lr.ph: ; preds = %entry
38+
br label %for.body
39+
40+
; CHECK-LABEL: for.body:
41+
42+
; Check that GepLSR was applied to the gep index where i64 multiplication was used.
43+
; CHECK: [[GEP:%.*]] = phi i32 addrspace(1)* [ [[GEP_PHI1]], %for.body.lr.ph ], [ [[GEP_PHI2:%.*]], %for.body ]
44+
; CHECK: %i.02 = phi i32 [ 39, %for.body.lr.ph ], [ %inc, %for.body ]
45+
; CHECK: store i32 11, i32 addrspace(1)* [[GEP]], align 4
46+
47+
; Check that GepLSR was NOT applied to the gep index where NO i64 multiplication was used.
48+
; CHECK-NOT: getelementptr i32, i32 addrspace(1)* [[VAR:.*]], i64 -2
49+
; CHECK: %add1 = add nsw i32 %i.02, 30
50+
; CHECK: %sub1 = sub nsw i32 %k, %add1
51+
; CHECK: %idxprom1 = zext i32 %sub1 to i64
52+
; CHECK: %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %t, i64 %idxprom1
53+
54+
; CHECK: %inc = add nuw nsw i32 %i.02, 2
55+
; CHECK: %cmp = icmp slt i32 %inc, %n
56+
; CHECK: [[GEP_PHI2]] = getelementptr i32, i32 addrspace(1)* [[GEP]], i64 [[STEP]]
57+
; CHECK: br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
58+
for.body: ; preds = %for.body.lr.ph, %for.body
59+
%i.02 = phi i32 [ 39, %for.body.lr.ph ], [ %inc, %for.body ]
60+
%add = add nsw i32 %i.02, 5
61+
%zext = zext i32 %add to i64
62+
%idxprom = mul i64 %zext, %multiplier
63+
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %idxprom
64+
store i32 11, i32 addrspace(1)* %arrayidx, align 4
65+
%add1 = add nsw i32 %i.02, 30
66+
%sub1 = sub nsw i32 %k, %add1
67+
%idxprom1 = zext i32 %sub1 to i64
68+
%arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %t, i64 %idxprom1
69+
store i32 77, i32 addrspace(1)* %arrayidx1, align 4
70+
%inc = add nuw nsw i32 %i.02, 2
71+
%cmp = icmp slt i32 %inc, %n
72+
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
73+
74+
for.cond.for.end_crit_edge: ; preds = %for.body
75+
br label %for.end
76+
77+
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
78+
store <1024 x i64> %loadedVec, <1024 x i64> addrspace(1)* %otp, align 8
79+
ret void
80+
}
81+
82+
!igc.functions = !{!0}
83+
84+
!0 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32, i64, <1024 x i64> addrspace(1)*)* @test, !1}
85+
!1 = !{!2}
86+
!2 = !{!"function_type", i32 0}

0 commit comments

Comments
 (0)