Apply GepLSR with higher threshold to optimize i64 mul instructions.

ppogotov · igcbot · commit 862602150ccf · 2025-01-17T22:08:07.000+01:00
GepLSR doesn't run when the register pressure is above the threshold. However, that
pressure can be reduced by subsequent passes, so this commit raises the threshold for
cases where arithmetic with i64 mul instructions can be optimized by GepLSR.
diff --git a/IGC/Compiler/Optimizer/OpenCLPasses/GEPLoopStrengthReduction/GEPLoopStrengthReduction.cpp b/IGC/Compiler/Optimizer/OpenCLPasses/GEPLoopStrengthReduction/GEPLoopStrengthReduction.cpp
@@ -126,6 +126,9 @@ struct Score
 
     // Estimated increase in register pressure when reducing to loop's preheader.
     unsigned RegisterPressure;
+
+    // Flag to show presence of i64 mul instructions in the address calculation.
+    bool ContainsMuli64;
 };
 
 
@@ -266,7 +269,7 @@ class Scorer
     void scoreReducedInstructions(ReductionCandidateGroup &Candidate);
     void scoreRegisterPressure(ReductionCandidateGroup &Candidate);
 
-    int estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP);
+    int estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP, bool &ContainsMuli64);
     int estimatePointerAddition(ReductionCandidateGroup &Candidate);
 
     const DataLayout &DL;
@@ -316,7 +319,7 @@ class RegisterPressureTracker
 private:
 
     unsigned MaxAllowedPressure;
-    unsigned ExternalPressure;
+    unsigned FunctionExternalPressure;
 
     IGCLivenessAnalysis &RPE;
     WIAnalysisRunner &WI;
@@ -764,15 +767,13 @@ void Scorer::scoreReducedInstructions(ReductionCandidateGroup &Candidate)
 
     // Score "+ base_ptr"
     score += estimatePointerAddition(Candidate);
+    bool ContainsMuli64 = false;
 
-    // Only need to deduce if reduction is net positive, no need to continue if already confirmed.
-    if (score < 1)
-    {
-        // Score "index"
-        score += estimateIndexInstructions(*Candidate.getLoop(), Cheapest.GEP);
-    }
+    // Score "index"
+    score += estimateIndexInstructions(*Candidate.getLoop(), Cheapest.GEP, ContainsMuli64);
 
     Candidate.Score.ReducesInstructions = score > 0;
+    Candidate.Score.ContainsMuli64 = ContainsMuli64;
 }
 
 
@@ -805,7 +806,8 @@ int Scorer::estimatePointerAddition(ReductionCandidateGroup &Candidate)
 // Estimates how many instructions required to calculate index would be reduced to preheader.
 // This differs from checking SCEV expression size, which it might represent simplified index
 // calculation.
-int Scorer::estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP)
+// Sets ContainsMuli64 flag to show if i64 multiplication is present in the gep index calculation.
+int Scorer::estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP, bool &ContainsMuli64)
 {
     Instruction *Index = dyn_cast<Instruction>(*(GEP->operands().end() - 1));
     if (!Index)
@@ -837,6 +839,9 @@ int Scorer::estimateIndexInstructions(const Loop &L, GetElementPtrInst *GEP)
             instructions += 1;
         }
 
+        if (I->getOpcode() == Instruction::Mul && I->getType()->isIntegerTy(64))
+            ContainsMuli64 = true;
+
         for (auto *It = I->operands().begin(); It != I->operands().end(); ++It)
         {
             if (auto *Next = dyn_cast<Instruction>(It))
@@ -1193,7 +1198,7 @@ RegisterPressureTracker::RegisterPressureTracker(Function &F, CodeGenContext &CG
 {
     MaxAllowedPressure = static_cast<unsigned>(CGC.getNumGRFPerThread() * IGC_GET_FLAG_VALUE(GEPLSRThresholdRatio) / 100.0f);
 
-    ExternalPressure = FRPE.getExternalPressureForFunction(&F);
+    FunctionExternalPressure = FRPE.getExternalPressureForFunction(&F);
 }
 
 
@@ -1218,14 +1223,31 @@ void RegisterPressureTracker::trackDeletedInstruction(Value *V)
 
 bool RegisterPressureTracker::fitsPressureThreshold(ReductionCandidateGroup &C)
 {
-    auto *F = C.getLoop()->getLoopPreheader()->getParent();
+    BasicBlock *Preheader = C.getLoop()->getLoopPreheader();
+    auto *F = Preheader->getParent();
     uint SIMD = numLanes(RPE.bestGuessSIMDSize(F));
 
-    unsigned InitialPressure = ExternalPressure + RPE.getMaxRegCountForLoop(*C.getLoop(), SIMD, &WI);
-    unsigned EstimatedPressure = InitialPressure + C.getScore().RegisterPressure;
+    unsigned MaxLoopPressure = RPE.getMaxRegCountForLoop(*C.getLoop(), SIMD, &WI);
+    unsigned AdditionalPressure = C.getScore().RegisterPressure;
+
+    InsideBlockPressureMap BBListing;
+    RPE.collectPressureForBB(*Preheader, BBListing, SIMD, &WI);
+    unsigned LoopExternalPressureInBytes = BBListing[cast<Value>(Preheader->getTerminator())];
+    unsigned LoopExternalPressure = RPE.bytesToRegisters(LoopExternalPressureInBytes);
 
+    unsigned InitialPressure = FunctionExternalPressure + MaxLoopPressure;
+    unsigned EstimatedPressure = InitialPressure + AdditionalPressure;
+
+    // Try not to increase register pressure above threshold.
     if (EstimatedPressure >= MaxAllowedPressure)
     {
+        // Even if the optimization increases register pressure, apply it in case we can move mul i64 to the preheader.
+        // This heuristic is based on the fact that mul i64 is an expensive instruction and any potential spills are generated outside of the loop.
+        unsigned NewInternalLoopPressure = LoopExternalPressure - MaxLoopPressure + AdditionalPressure;
+        if (C.getScore().ContainsMuli64 && NewInternalLoopPressure < MaxAllowedPressure) {
+            return true;
+        }
+
         LLVM_DEBUG(
             dbgs() << "  Estimated register pressure " << EstimatedPressure << " above threshold " << MaxAllowedPressure << "; can't fully reduce ";
             C.print(dbgs());
diff --git a/IGC/Compiler/tests/GEPLoopStrengthReduction/optimize-mulexpr-with-high-regpressure.ll b/IGC/Compiler/tests/GEPLoopStrengthReduction/optimize-mulexpr-with-high-regpressure.ll
@@ -0,0 +1,86 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2024 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+
+; REQUIRES: regkeys
+; RUN: igc_opt -debugify --igc-gep-loop-strength-reduction -check-debugify -S < %s 2>&1 | FileCheck %s
+
+; Reduced index is expressed with SCEVMulExpr.
+
+; Debug-info related check
+; CHECK: CheckModuleDebugify: PASS
+
+%"class.IntVector" = type { <1024 x i64> }
+
+define spir_kernel void @test(i32 addrspace(1)* %p, i32 addrspace(1)* %t, i32 %k, i32 %n, i64 %multiplier, <1024 x i64> addrspace(1)* %otp)  {
+entry:
+  %_alloca = alloca %"class.IntVector", align 8
+  %vecPtr = getelementptr %"class.IntVector", %"class.IntVector"* %_alloca, i32 0, i32 0
+  store <1024 x i64> zeroinitializer, <1024 x i64>* %vecPtr, align 8
+  %loadedVec = load <1024 x i64>, <1024 x i64>* %vecPtr, align 8
+  %cmp1 = icmp slt i32 0, %n
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+; CHECK-LABEL: for.body.lr.ph:
+; Check that GepLSR was applied to the gep index where i64 multiplication was used.
+; CHECK:         [[MULL:%.*]] = mul i64 %multiplier, 44
+; CHECK:         [[GEP_PHI1:%.*]] = getelementptr i32, i32 addrspace(1)* %p, i64 [[MULL]]
+; CHECK:         [[STEP:%.*]] = shl i64 %multiplier, 1
+
+; Check that GepLSR was NOT applied to the gep index where NO i64 multiplication was used.
+; CHECK-NOT:     add i32 %k, -69
+; CHECK-NOT:     getelementptr i32, i32 addrspace(1)* %t
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+; CHECK-LABEL: for.body:
+
+; Check that GepLSR was applied to the gep index where i64 multiplication was used.
+; CHECK:         [[GEP:%.*]] = phi i32 addrspace(1)* [ [[GEP_PHI1]], %for.body.lr.ph ], [ [[GEP_PHI2:%.*]], %for.body ]
+; CHECK:         %i.02 = phi i32 [ 39, %for.body.lr.ph ], [ %inc, %for.body ]
+; CHECK:         store i32 11, i32 addrspace(1)* [[GEP]], align 4
+
+; Check that GepLSR was NOT applied to the gep index where NO i64 multiplication was used.
+; CHECK-NOT:     getelementptr i32, i32 addrspace(1)* [[VAR:.*]], i64 -2
+; CHECK:         %add1 = add nsw i32 %i.02, 30
+; CHECK:         %sub1 = sub nsw i32 %k, %add1
+; CHECK:         %idxprom1 = zext i32 %sub1 to i64
+; CHECK:         %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %t, i64 %idxprom1
+
+; CHECK:         %inc = add nuw nsw i32 %i.02, 2
+; CHECK:         %cmp = icmp slt i32 %inc, %n
+; CHECK:         [[GEP_PHI2]] = getelementptr i32, i32 addrspace(1)* [[GEP]], i64 [[STEP]]
+; CHECK:         br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.02 = phi i32 [ 39, %for.body.lr.ph ], [ %inc, %for.body ]
+  %add = add nsw i32 %i.02, 5
+  %zext = zext i32 %add to i64
+  %idxprom = mul i64 %zext, %multiplier
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %p, i64 %idxprom
+  store i32 11, i32 addrspace(1)* %arrayidx, align 4
+  %add1 = add nsw i32 %i.02, 30
+  %sub1 = sub nsw i32 %k, %add1
+  %idxprom1 = zext i32 %sub1 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %t, i64 %idxprom1
+  store i32 77, i32 addrspace(1)* %arrayidx1, align 4
+  %inc = add nuw nsw i32 %i.02, 2
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  store <1024 x i64> %loadedVec, <1024 x i64> addrspace(1)* %otp, align 8
+  ret void
+}
+
+!igc.functions = !{!0}
+
+!0 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32, i64, <1024 x i64> addrspace(1)*)* @test, !1}
+!1 = !{!2}
+!2 = !{!"function_type", i32 0}