From 064f5077efd4a8bedddf6eca0f776eae7a4fec6d Mon Sep 17 00:00:00 2001 From: Vladi Krapp Date: Fri, 11 Oct 2024 16:14:14 +0100 Subject: [PATCH 1/2] [ARM] Prefer MUL to MULS on some implementations MULS adversely affects performance on many implementations. Where this is the case, we prefer not to shrink MUL to MULS. --- llvm/lib/Target/ARM/ARMFeatures.td | 7 +++++++ llvm/lib/Target/ARM/ARMProcessors.td | 1 + llvm/lib/Target/ARM/Thumb2SizeReduction.cpp | 3 +++ llvm/test/CodeGen/Thumb2/avoidmuls.mir | 4 ++-- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index 3a2188adbec33..bb437698296ce 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -398,6 +398,13 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", "AvoidCPSRPartialUpdate", "true", "Avoid CPSR partial update for OOO execution">; +/// FeatureAvoidMULS - If true, codegen would avoid using the MULS instruction, +/// preferring the thumb2 MUL which doesn't set flags. +def FeatureAvoidMULS : SubtargetFeature<"avoid-muls", + "AvoidMULS", "true", + "Avoid MULS instructions for M class cores">; + + /// Disable +1 predication cost for instructions updating CPSR. /// Enabled for Cortex-A57. /// True if disable +1 predication cost for instructions updating CPSR. Enabled for Cortex-A57. 
diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td index 08f62d12f4a9f..b94a5fc161469 100644 --- a/llvm/lib/Target/ARM/ARMProcessors.td +++ b/llvm/lib/Target/ARM/ARMProcessors.td @@ -360,6 +360,7 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline, FeatureHasSlowFPVFMx, FeatureUseMISched, FeatureHasNoBranchPredictor, + FeatureAvoidMULS, FeatureFixCMSE_CVE_2021_35465]>; def : ProcessorModel<"star-mc1", CortexM4Model, [ARMv8mMainline, diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index f572af9860073..f4a9915a78b99 100644 --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -755,6 +755,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, Register Reg1 = MI->getOperand(1).getReg(); // t2MUL is "special". The tied source operand is second, not first. if (MI->getOpcode() == ARM::t2MUL) { + // MULS can be slower than MUL + if (!MinimizeSize && STI->avoidMULS()) + return false; Register Reg2 = MI->getOperand(2).getReg(); // Early exit if the regs aren't all low regs. if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1) diff --git a/llvm/test/CodeGen/Thumb2/avoidmuls.mir b/llvm/test/CodeGen/Thumb2/avoidmuls.mir index 8d5567482d5cd..09b7e62bee04e 100644 --- a/llvm/test/CodeGen/Thumb2/avoidmuls.mir +++ b/llvm/test/CodeGen/Thumb2/avoidmuls.mir @@ -63,5 +63,5 @@ body: | ... # CHECK-LABEL: test -# CHECK: tMUL -# CHECK-NOT: t2MUL +# CHECK: t2MUL +# CHECK-NOT: tMUL From 475683b2d2aa299ea3ef002681564bd5f74dd4f7 Mon Sep 17 00:00:00 2001 From: Vladi Krapp Date: Wed, 16 Oct 2024 15:59:44 +0100 Subject: [PATCH 2/2] fixup! 
[ARM] Prefer MUL to MULS on some implementations --- llvm/test/CodeGen/Thumb2/avoidmuls.mir | 67 ++++---------------------- 1 file changed, 10 insertions(+), 57 deletions(-) diff --git a/llvm/test/CodeGen/Thumb2/avoidmuls.mir b/llvm/test/CodeGen/Thumb2/avoidmuls.mir index 09b7e62bee04e..865152068fdf7 100644 --- a/llvm/test/CodeGen/Thumb2/avoidmuls.mir +++ b/llvm/test/CodeGen/Thumb2/avoidmuls.mir @@ -1,67 +1,20 @@ -# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s +# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m33 -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MUL +# RUN: llc -mtriple=thumbv7m-none-eabi --run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MULS ---- | - target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8m.main-arm-none-eabi" - - ; Function Attrs: norecurse nounwind readnone - define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 { - entry: - %cmp6 = icmp sgt i32 %y, 0 - br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup - - for.body.preheader: ; preds = %entry - br label %for.body - - for.cond.cleanup: ; preds = %for.body, %entry - %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ] - ret i32 %sum.0.lcssa - - for.body: ; preds = %for.body, %for.body.preheader - %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ] - %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ] - %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ] - %mul = mul nsw i32 %lsr.iv1, %sum.07 - %lsr.iv.next = add i32 %lsr.iv, -1 - %lsr.iv.next2 = add i32 %lsr.iv1, 1 - %exitcond = icmp eq i32 %lsr.iv.next, 0 - br i1 %exitcond, label %for.cond.cleanup, label %for.body - } - - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" 
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m33" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" } - -... --- name: test -tracksRegLiveness: true -liveins: - - { reg: '$r0', virtual-reg: '' } - - { reg: '$r1', virtual-reg: '' } body: | - bb.0.entry: - successors: %bb.1.for.body, %bb.2.for.cond.cleanup - liveins: $r0, $r1 - + bb.0: $r2 = tMOVr $r0, 14, _ $r0 = t2MOVi 1, 14, _, _ - t2CMPri $r1, 1, 14, _, implicit-def $cpsr - t2Bcc %bb.2.for.cond.cleanup, 11, killed $cpsr - - bb.1.for.body: - successors: %bb.2.for.cond.cleanup, %bb.1.for.body - liveins: $r0, $r1, $r2 - $r0 = t2MUL $r2, killed $r0, 14, _ - $r2 = t2ADDri killed $r2, 1, 14, _, _ - $r1 = t2SUBri killed $r1, 1, 14, _, def $cpsr - t2Bcc %bb.1.for.body, 1, killed $cpsr - - bb.2.for.cond.cleanup: - liveins: $r0 - tBX_RET 14, _, implicit $r0 ... -# CHECK-LABEL: test -# CHECK: t2MUL -# CHECK-NOT: tMUL +# MUL-LABEL: test +# MUL: t2MUL +# MUL-NOT: tMUL + +# MULS-LABEL: test +# MULS: tMUL +# MULS-NOT: t2MUL \ No newline at end of file