From e518ddeea35c2dde3c55ae06a913d503710c148e Mon Sep 17 00:00:00 2001 From: Vladi Krapp Date: Wed, 4 Dec 2024 19:45:46 +0000 Subject: [PATCH] [ARM] Test unroll behaviour on machines with low overhead branching Current behaviour is the single loop with fmul gets runtime unrolled by count of 4, with the loop remainder unrolled as the 3 for.body9.us.prol sections. This is quite a lot of compare and branch, negating the benefits of the low overhead loop mechanism. --- .../Transforms/LoopUnroll/ARM/lob-unroll.ll | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll diff --git a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll new file mode 100644 index 0000000000000..b155f5d31045f --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll @@ -0,0 +1,63 @@ +; RUN: opt -mcpu=cortex-m55 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=LOB + +; This test checks behaviour of loop unrolling on processors with low overhead branching available + +; LOB-CHECK-LABEL: for.body{{.*}}.prol +; LOB-COUNT-1: fmul fast float +; LOB-CHECK-LABEL: for.body{{.*}}.prol.1 +; LOB-COUNT-1: fmul fast float +; LOB-CHECK-LABEL: for.body{{.*}}.prol.2 +; LOB-COUNT-1: fmul fast float +; LOB-CHECK-LABEL: for.body{{.*}} +; LOB-COUNT-4: fmul fast float +; LOB-NOT: fmul fast float + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) +define dso_local void @test(i32 noundef %n, ptr nocapture noundef %pA) local_unnamed_addr #0 { +entry: + %cmp46 = icmp sgt i32 %n, 0 + br i1 %cmp46, label %for.body, label %for.cond.cleanup + +for.cond.loopexit: ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.body + %exitcond49.not = icmp eq i32 %add, %n + br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.cond.loopexit, %entry + ret void + +for.body: ; preds = %entry, %for.cond.loopexit + %k.047 = phi i32 [ %add, %for.cond.loopexit ], [ 0, %entry ] + %add = add nuw nsw i32 %k.047, 1 + %cmp244 = icmp slt i32 %add, %n + br i1 %cmp244, label %for.cond6.preheader.lr.ph, label %for.cond.loopexit + +for.cond6.preheader.lr.ph: ; preds = %for.body + %invariant.gep = getelementptr float, ptr %pA, i32 %k.047 + br label %for.cond6.preheader.us + +for.cond6.preheader.us: ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.cond6.preheader.lr.ph + %w.045.us = phi i32 [ %add, %for.cond6.preheader.lr.ph ], [ %inc19.us, %for.cond6.for.cond.cleanup8_crit_edge.us ] + %mul.us = mul nuw nsw i32 %w.045.us, %n + %0 = getelementptr float, ptr %pA, i32 %mul.us + %arrayidx.us = getelementptr float, ptr %0, i32 %k.047 + br label %for.body9.us + +for.body9.us: ; preds = %for.cond6.preheader.us, %for.body9.us + %x.043.us = phi i32 [ %add, %for.cond6.preheader.us ], [ %inc.us, %for.body9.us ] + %1 = load float, ptr %arrayidx.us, align 4 + %mul11.us = mul nuw nsw i32 %x.043.us, %n + %gep.us = getelementptr float, ptr %invariant.gep, i32 %mul11.us + %2 = load float, ptr %gep.us, align 4 + %mul14.us = fmul fast float %2, %1 + %arrayidx17.us = getelementptr float, ptr %0, i32 %x.043.us + store float %mul14.us, ptr %arrayidx17.us, align 4 + %inc.us = add nuw nsw i32 %x.043.us, 1 + %exitcond.not = icmp eq i32 %inc.us, %n + br i1 %exitcond.not, label %for.cond6.for.cond.cleanup8_crit_edge.us, label %for.body9.us + +for.cond6.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us + %inc19.us = add nuw nsw i32 %w.045.us, 1 + %exitcond48.not = icmp eq i32 %inc19.us, %n + br i1 %exitcond48.not, label %for.cond.loopexit, label %for.cond6.preheader.us +} +