From 1247fddf3676543837b71965c88f021072513c63 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Tue, 24 Jun 2025 09:06:18 +0200 Subject: [PATCH] [SimplifyCFG] Relax `cttz` cost check in `simplifySwitchOfPowersOfTwo` The `simplifySwitchOfPowersOfTwo` transform should be allowed to take place on recent X86 targets, where the weighted size-and-latency cost of `cttz` appears to be 2. This favours computing trailing zeroes and indexing into a smaller value table over generating a jump table with an indirect branch, which should be more efficient overall. --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 20 +++++------ .../X86/switch-of-powers-of-two.ll | 35 +++++++++++++++++++ 2 files changed, 45 insertions(+), 10 deletions(-) create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index eb52c1b7e6fba..e205551658aa5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7198,8 +7198,10 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, /// will be transformed to: /// switch (count_trailing_zeros(C)) { case 0: case 1: case 6: case 7: } /// -/// This transformation allows better lowering and could allow transforming into -/// a lookup table. +/// This transformation allows better lowering and may transform the switch +/// instruction into a sequence of bit manipulation and a smaller +/// log2(C)-indexed value table (instead of traditionally emitting a load of the +/// address of the jump target, and indirectly jump to it). 
static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, const DataLayout &DL, const TargetTransformInfo &TTI) { @@ -7211,17 +7213,15 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) return false; - const auto CttzIntrinsicCost = TTI.getIntrinsicInstrCost( - IntrinsicCostAttributes(Intrinsic::cttz, CondTy, - {Condition, ConstantInt::getTrue(Context)}), - TTI::TCK_SizeAndLatency); - - if (CttzIntrinsicCost > TTI::TCC_Basic) - // Inserting intrinsic is too expensive. + // Ensure trailing zeroes count intrinsic emission is not too expensive. + IntrinsicCostAttributes Attrs(Intrinsic::cttz, CondTy, + {Condition, ConstantInt::getTrue(Context)}); + if (TTI.getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > + TTI::TCC_Basic * 2) return false; // Only bother with this optimization if there are more than 3 switch cases. - // SDAG will only bother creating jump tables for 4 or more cases. + // SDAG will start emitting jump tables for 4 or more cases. 
if (SI->getNumCases() < 4) return false; diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll new file mode 100644 index 0000000000000..529826758c138 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +define i32 @switch_of_powers_two(i32 %arg) { +; CHECK-LABEL: define i32 @switch_of_powers_two( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 true) +; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [7 x i32], ptr @switch.table.switch_of_powers_two, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4 +; CHECK-NEXT: ret i32 [[SWITCH_LOAD]] +; +entry: + switch i32 %arg, label %default_case [ + i32 1, label %bb1 + i32 8, label %bb2 + i32 16, label %bb3 + i32 32, label %bb4 + i32 64, label %bb5 + ] + + +default_case: unreachable +bb1: br label %return +bb2: br label %return +bb3: br label %return +bb4: br label %return +bb5: br label %return + +return: + %phi = phi i32 [ 3, %bb1 ], [ 2, %bb2 ], [ 1, %bb3 ], [ 0, %bb4 ], [ 42, %bb5 ] + ret i32 %phi +}