Skip to content

Conversation

@s-barannikov
Copy link
Contributor

The code below the removed check looks generic enough to support arbitrary integer widths. This change helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input.

@llvmbot
Copy link
Member

llvmbot commented Apr 24, 2025

@llvm/pr-subscribers-backend-arm
@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-llvm-globalisel

Author: Sergei Barannikov (s-barannikov)

Changes

The code below the removed check looks generic enough to support arbitrary integer widths. This change helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input.


Patch is 40.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137197.diff

11 Files Affected:

  • (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+2-2)
  • (modified) llvm/test/CodeGen/ARM/cttz.ll (+76-64)
  • (modified) llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll (+20-6)
  • (modified) llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll (+78-52)
  • (modified) llvm/test/CodeGen/RISCV/rv32xtheadbb.ll (+51-34)
  • (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+39-29)
  • (modified) llvm/test/CodeGen/SPARC/ctlz.ll (+34-76)
  • (modified) llvm/test/CodeGen/X86/ctlo.ll (+16-11)
  • (modified) llvm/test/CodeGen/X86/ctlz.ll (+19-13)
  • (modified) llvm/test/CodeGen/X86/cttz.ll (+16-17)
  • (modified) llvm/test/CodeGen/X86/lzcnt-cmp.ll (+74-12)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e8dc7752b23c0..f9dcb472ed1d2 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2552,9 +2552,9 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
       (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
     return false;
 
-  // Only handle legal scalar cases. Anything else requires too much work.
+  // Only handle scalar cases. Anything else requires too much work.
   unsigned SizeInBits = Ty->getScalarSizeInBits();
-  if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
+  if (Ty->isVectorTy())
     return false;
 
   // Bail if the value is never zero.
diff --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll
index 76adc61c5971f..1146ad64ee709 100644
--- a/llvm/test/CodeGen/ARM/cttz.ll
+++ b/llvm/test/CodeGen/ARM/cttz.ll
@@ -221,43 +221,49 @@ define i64 @test_i64(i64 %a) {
 ;
 ; CHECK-6M-LABEL: test_i64:
 ; CHECK-6M:       @ %bb.0:
-; CHECK-6M-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-6M-NEXT:    push {r4, r5, r7, lr}
+; CHECK-6M-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-6M-NEXT:    push {r4, r5, r6, lr}
+; CHECK-6M-NEXT:    mov r3, r1
 ; CHECK-6M-NEXT:    mov r2, r0
-; CHECK-6M-NEXT:    ldr r5, .LCPI3_0
-; CHECK-6M-NEXT:    adr r3, .LCPI3_1
+; CHECK-6M-NEXT:    movs r1, #0
+; CHECK-6M-NEXT:    orrs r0, r3
+; CHECK-6M-NEXT:    beq .LBB3_6
+; CHECK-6M-NEXT:  @ %bb.1: @ %cond.false
+; CHECK-6M-NEXT:    ldr r6, .LCPI3_0
+; CHECK-6M-NEXT:    adr r4, .LCPI3_1
 ; CHECK-6M-NEXT:    movs r0, #32
-; CHECK-6M-NEXT:    cmp r1, #0
-; CHECK-6M-NEXT:    mov r4, r0
-; CHECK-6M-NEXT:    beq .LBB3_2
-; CHECK-6M-NEXT:  @ %bb.1:
-; CHECK-6M-NEXT:    rsbs r4, r1, #0
-; CHECK-6M-NEXT:    ands r4, r1
-; CHECK-6M-NEXT:    muls r4, r5, r4
-; CHECK-6M-NEXT:    lsrs r1, r4, #27
-; CHECK-6M-NEXT:    ldrb r4, [r3, r1]
-; CHECK-6M-NEXT:  .LBB3_2:
-; CHECK-6M-NEXT:    adds r4, #32
-; CHECK-6M-NEXT:    rsbs r1, r2, #0
-; CHECK-6M-NEXT:    ands r1, r2
-; CHECK-6M-NEXT:    muls r5, r1, r5
-; CHECK-6M-NEXT:    lsrs r1, r5, #27
+; CHECK-6M-NEXT:    cmp r3, #0
+; CHECK-6M-NEXT:    mov r5, r0
+; CHECK-6M-NEXT:    beq .LBB3_3
+; CHECK-6M-NEXT:  @ %bb.2: @ %cond.false
+; CHECK-6M-NEXT:    rsbs r5, r3, #0
+; CHECK-6M-NEXT:    ands r5, r3
+; CHECK-6M-NEXT:    muls r5, r6, r5
+; CHECK-6M-NEXT:    lsrs r3, r5, #27
+; CHECK-6M-NEXT:    ldrb r5, [r4, r3]
+; CHECK-6M-NEXT:  .LBB3_3: @ %cond.false
+; CHECK-6M-NEXT:    adds r5, #32
+; CHECK-6M-NEXT:    rsbs r3, r2, #0
+; CHECK-6M-NEXT:    ands r3, r2
+; CHECK-6M-NEXT:    muls r6, r3, r6
+; CHECK-6M-NEXT:    lsrs r3, r6, #27
 ; CHECK-6M-NEXT:    cmp r2, #0
-; CHECK-6M-NEXT:    bne .LBB3_5
-; CHECK-6M-NEXT:  @ %bb.3:
-; CHECK-6M-NEXT:    beq .LBB3_6
-; CHECK-6M-NEXT:  .LBB3_4:
-; CHECK-6M-NEXT:    movs r1, #0
-; CHECK-6M-NEXT:    pop {r4, r5, r7, pc}
-; CHECK-6M-NEXT:  .LBB3_5:
-; CHECK-6M-NEXT:    ldrb r0, [r3, r1]
-; CHECK-6M-NEXT:    bne .LBB3_4
+; CHECK-6M-NEXT:    bne .LBB3_7
+; CHECK-6M-NEXT:  @ %bb.4: @ %cond.false
+; CHECK-6M-NEXT:    beq .LBB3_8
+; CHECK-6M-NEXT:  .LBB3_5: @ %cond.end
+; CHECK-6M-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-6M-NEXT:  .LBB3_6:
-; CHECK-6M-NEXT:    mov r0, r4
-; CHECK-6M-NEXT:    movs r1, #0
-; CHECK-6M-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-6M-NEXT:    movs r0, #64
+; CHECK-6M-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-6M-NEXT:  .LBB3_7: @ %cond.false
+; CHECK-6M-NEXT:    ldrb r0, [r4, r3]
+; CHECK-6M-NEXT:    bne .LBB3_5
+; CHECK-6M-NEXT:  .LBB3_8: @ %cond.false
+; CHECK-6M-NEXT:    mov r0, r5
+; CHECK-6M-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-6M-NEXT:    .p2align 2
-; CHECK-6M-NEXT:  @ %bb.7:
+; CHECK-6M-NEXT:  @ %bb.9:
 ; CHECK-6M-NEXT:  .LCPI3_0:
 ; CHECK-6M-NEXT:    .long 125613361 @ 0x77cb531
 ; CHECK-6M-NEXT:  .LCPI3_1:
@@ -265,43 +271,49 @@ define i64 @test_i64(i64 %a) {
 ;
 ; CHECK-8MBASE-LABEL: test_i64:
 ; CHECK-8MBASE:       @ %bb.0:
-; CHECK-8MBASE-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-8MBASE-NEXT:    push {r4, r5, r7, lr}
+; CHECK-8MBASE-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-8MBASE-NEXT:    push {r4, r5, r6, lr}
+; CHECK-8MBASE-NEXT:    mov r3, r1
 ; CHECK-8MBASE-NEXT:    mov r2, r0
-; CHECK-8MBASE-NEXT:    movw r5, #46385
-; CHECK-8MBASE-NEXT:    movt r5, #1916
-; CHECK-8MBASE-NEXT:    adr r3, .LCPI3_0
+; CHECK-8MBASE-NEXT:    movs r1, #0
+; CHECK-8MBASE-NEXT:    orrs r0, r3
+; CHECK-8MBASE-NEXT:    beq .LBB3_6
+; CHECK-8MBASE-NEXT:  @ %bb.1: @ %cond.false
+; CHECK-8MBASE-NEXT:    movw r6, #46385
+; CHECK-8MBASE-NEXT:    movt r6, #1916
+; CHECK-8MBASE-NEXT:    adr r4, .LCPI3_0
 ; CHECK-8MBASE-NEXT:    movs r0, #32
-; CHECK-8MBASE-NEXT:    mov r4, r0
-; CHECK-8MBASE-NEXT:    cbz r1, .LBB3_2
-; CHECK-8MBASE-NEXT:  @ %bb.1:
-; CHECK-8MBASE-NEXT:    rsbs r4, r1, #0
-; CHECK-8MBASE-NEXT:    ands r4, r1
-; CHECK-8MBASE-NEXT:    muls r4, r5, r4
-; CHECK-8MBASE-NEXT:    lsrs r1, r4, #27
-; CHECK-8MBASE-NEXT:    ldrb r4, [r3, r1]
-; CHECK-8MBASE-NEXT:  .LBB3_2:
-; CHECK-8MBASE-NEXT:    adds r4, #32
-; CHECK-8MBASE-NEXT:    rsbs r1, r2, #0
-; CHECK-8MBASE-NEXT:    ands r1, r2
-; CHECK-8MBASE-NEXT:    muls r5, r1, r5
-; CHECK-8MBASE-NEXT:    lsrs r1, r5, #27
+; CHECK-8MBASE-NEXT:    mov r5, r0
+; CHECK-8MBASE-NEXT:    cbz r3, .LBB3_3
+; CHECK-8MBASE-NEXT:  @ %bb.2: @ %cond.false
+; CHECK-8MBASE-NEXT:    rsbs r5, r3, #0
+; CHECK-8MBASE-NEXT:    ands r5, r3
+; CHECK-8MBASE-NEXT:    muls r5, r6, r5
+; CHECK-8MBASE-NEXT:    lsrs r3, r5, #27
+; CHECK-8MBASE-NEXT:    ldrb r5, [r4, r3]
+; CHECK-8MBASE-NEXT:  .LBB3_3: @ %cond.false
+; CHECK-8MBASE-NEXT:    adds r5, #32
+; CHECK-8MBASE-NEXT:    rsbs r3, r2, #0
+; CHECK-8MBASE-NEXT:    ands r3, r2
+; CHECK-8MBASE-NEXT:    muls r6, r3, r6
+; CHECK-8MBASE-NEXT:    lsrs r3, r6, #27
 ; CHECK-8MBASE-NEXT:    cmp r2, #0
-; CHECK-8MBASE-NEXT:    bne .LBB3_5
-; CHECK-8MBASE-NEXT:  @ %bb.3:
-; CHECK-8MBASE-NEXT:    beq .LBB3_6
-; CHECK-8MBASE-NEXT:  .LBB3_4:
-; CHECK-8MBASE-NEXT:    movs r1, #0
-; CHECK-8MBASE-NEXT:    pop {r4, r5, r7, pc}
-; CHECK-8MBASE-NEXT:  .LBB3_5:
-; CHECK-8MBASE-NEXT:    ldrb r0, [r3, r1]
-; CHECK-8MBASE-NEXT:    bne .LBB3_4
+; CHECK-8MBASE-NEXT:    bne .LBB3_7
+; CHECK-8MBASE-NEXT:  @ %bb.4: @ %cond.false
+; CHECK-8MBASE-NEXT:    beq .LBB3_8
+; CHECK-8MBASE-NEXT:  .LBB3_5: @ %cond.end
+; CHECK-8MBASE-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-8MBASE-NEXT:  .LBB3_6:
-; CHECK-8MBASE-NEXT:    mov r0, r4
-; CHECK-8MBASE-NEXT:    movs r1, #0
-; CHECK-8MBASE-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-8MBASE-NEXT:    movs r0, #64
+; CHECK-8MBASE-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-8MBASE-NEXT:  .LBB3_7: @ %cond.false
+; CHECK-8MBASE-NEXT:    ldrb r0, [r4, r3]
+; CHECK-8MBASE-NEXT:    bne .LBB3_5
+; CHECK-8MBASE-NEXT:  .LBB3_8: @ %cond.false
+; CHECK-8MBASE-NEXT:    mov r0, r5
+; CHECK-8MBASE-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-8MBASE-NEXT:    .p2align 2
-; CHECK-8MBASE-NEXT:  @ %bb.7:
+; CHECK-8MBASE-NEXT:  @ %bb.9:
 ; CHECK-8MBASE-NEXT:  .LCPI3_0:
 ; CHECK-8MBASE-NEXT:    .ascii "\000\001\034\002\035\016\030\003\036\026\024\017\031\021\004\b\037\033\r\027\025\023\020\007\032\f\022\006\013\005\n\t"
   %tmp = call i64 @llvm.cttz.i64(i64 %a, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
index f9af74d6ec323..0632caecf8907 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
@@ -62,6 +62,9 @@ declare i64 @llvm.ctlz.i64(i64, i1)
 define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: ctlz_i64:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    or a2, a0, a1
+; RV32I-NEXT:    beqz a2, .LBB1_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    lui a3, 209715
 ; RV32I-NEXT:    lui a6, 61681
@@ -69,8 +72,8 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    addi a4, a3, 819
 ; RV32I-NEXT:    addi a3, a6, -241
 ; RV32I-NEXT:    li a2, 32
-; RV32I-NEXT:    beqz a1, .LBB1_2
-; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    beqz a1, .LBB1_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
 ; RV32I-NEXT:    srli a0, a1, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
@@ -99,7 +102,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    sub a0, a2, a0
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB1_2:
+; RV32I-NEXT:  .LBB1_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a0, 64
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB1_4:
 ; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 2
@@ -195,14 +202,17 @@ declare i64 @llvm.cttz.i64(i64, i1)
 define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: cttz_i64:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    or a2, a0, a1
+; RV32I-NEXT:    beqz a2, .LBB3_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    lui a3, 209715
 ; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
 ; RV32I-NEXT:    addi a3, a3, 819
 ; RV32I-NEXT:    addi a2, a5, -241
-; RV32I-NEXT:    beqz a0, .LBB3_2
-; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    beqz a0, .LBB3_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
 ; RV32I-NEXT:    not a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a0, a1, a0
@@ -223,7 +233,11 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB3_2:
+; RV32I-NEXT:  .LBB3_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a0, 64
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB3_4:
 ; RV32I-NEXT:    not a0, a1
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    and a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index a46168f114bb9..3a7d31253b05d 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -374,39 +374,42 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    and a0, s0, a0
-; RV32I-NEXT:    lui a1, 30667
-; RV32I-NEXT:    addi s3, a1, 1329
-; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    beqz a1, .LBB3_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    lui a2, 30667
+; RV32I-NEXT:    addi s2, a2, 1329
+; RV32I-NEXT:    mv s4, a0
+; RV32I-NEXT:    mv a0, a1
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
-; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
-; RV32I-NEXT:    neg a0, s2
-; RV32I-NEXT:    and a0, s2, a0
-; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    lui s3, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s3, s3, %lo(.LCPI3_0)
+; RV32I-NEXT:    neg a0, s0
+; RV32I-NEXT:    and a0, s0, a0
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __mulsi3
-; RV32I-NEXT:    bnez s2, .LBB3_3
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    li a0, 32
-; RV32I-NEXT:    beqz s0, .LBB3_4
-; RV32I-NEXT:  .LBB3_2:
-; RV32I-NEXT:    srli s1, s1, 27
-; RV32I-NEXT:    add s1, s4, s1
-; RV32I-NEXT:    lbu a0, 0(s1)
-; RV32I-NEXT:    j .LBB3_5
-; RV32I-NEXT:  .LBB3_3:
+; RV32I-NEXT:    bnez s4, .LBB3_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    add a0, s4, a0
+; RV32I-NEXT:    add a0, s3, a0
 ; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    bnez s0, .LBB3_2
-; RV32I-NEXT:  .LBB3_4:
 ; RV32I-NEXT:    addi a0, a0, 32
-; RV32I-NEXT:  .LBB3_5:
+; RV32I-NEXT:    j .LBB3_5
+; RV32I-NEXT:  .LBB3_3:
+; RV32I-NEXT:    li a0, 64
+; RV32I-NEXT:    j .LBB3_6
+; RV32I-NEXT:  .LBB3_4:
+; RV32I-NEXT:    srli s1, s1, 27
+; RV32I-NEXT:    add s1, s3, s1
+; RV32I-NEXT:    lbu a0, 0(s1)
+; RV32I-NEXT:  .LBB3_5: # %cond.false
 ; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:  .LBB3_6: # %cond.end
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -441,33 +444,35 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ;
 ; RV32M-LABEL: test_cttz_i64:
 ; RV32M:       # %bb.0:
+; RV32M-NEXT:    or a2, a0, a1
+; RV32M-NEXT:    beqz a2, .LBB3_3
+; RV32M-NEXT:  # %bb.1: # %cond.false
 ; RV32M-NEXT:    lui a2, 30667
 ; RV32M-NEXT:    addi a3, a2, 1329
 ; RV32M-NEXT:    lui a2, %hi(.LCPI3_0)
 ; RV32M-NEXT:    addi a2, a2, %lo(.LCPI3_0)
-; RV32M-NEXT:    bnez a1, .LBB3_3
-; RV32M-NEXT:  # %bb.1:
-; RV32M-NEXT:    li a1, 32
-; RV32M-NEXT:    beqz a0, .LBB3_4
-; RV32M-NEXT:  .LBB3_2:
-; RV32M-NEXT:    neg a1, a0
-; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    bnez a0, .LBB3_4
+; RV32M-NEXT:  # %bb.2: # %cond.false
+; RV32M-NEXT:    neg a0, a1
+; RV32M-NEXT:    and a0, a1, a0
 ; RV32M-NEXT:    mul a0, a0, a3
 ; RV32M-NEXT:    srli a0, a0, 27
 ; RV32M-NEXT:    add a0, a2, a0
 ; RV32M-NEXT:    lbu a0, 0(a0)
+; RV32M-NEXT:    addi a0, a0, 32
 ; RV32M-NEXT:    li a1, 0
 ; RV32M-NEXT:    ret
 ; RV32M-NEXT:  .LBB3_3:
-; RV32M-NEXT:    neg a4, a1
-; RV32M-NEXT:    and a1, a1, a4
-; RV32M-NEXT:    mul a1, a1, a3
-; RV32M-NEXT:    srli a1, a1, 27
-; RV32M-NEXT:    add a1, a2, a1
-; RV32M-NEXT:    lbu a1, 0(a1)
-; RV32M-NEXT:    bnez a0, .LBB3_2
+; RV32M-NEXT:    li a1, 0
+; RV32M-NEXT:    li a0, 64
+; RV32M-NEXT:    ret
 ; RV32M-NEXT:  .LBB3_4:
-; RV32M-NEXT:    addi a0, a1, 32
+; RV32M-NEXT:    neg a1, a0
+; RV32M-NEXT:    and a0, a0, a1
+; RV32M-NEXT:    mul a0, a0, a3
+; RV32M-NEXT:    srli a0, a0, 27
+; RV32M-NEXT:    add a0, a2, a0
+; RV32M-NEXT:    lbu a0, 0(a0)
 ; RV32M-NEXT:    li a1, 0
 ; RV32M-NEXT:    ret
 ;
@@ -510,21 +515,28 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ;
 ; RV32XTHEADBB-LABEL: test_cttz_i64:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    bnez a0, .LBB3_2
-; RV32XTHEADBB-NEXT:  # %bb.1:
+; RV32XTHEADBB-NEXT:    or a2, a0, a1
+; RV32XTHEADBB-NEXT:    beqz a2, .LBB3_3
+; RV32XTHEADBB-NEXT:  # %bb.1: # %cond.false
+; RV32XTHEADBB-NEXT:    bnez a0, .LBB3_4
+; RV32XTHEADBB-NEXT:  # %bb.2: # %cond.false
 ; RV32XTHEADBB-NEXT:    addi a0, a1, -1
 ; RV32XTHEADBB-NEXT:    not a1, a1
 ; RV32XTHEADBB-NEXT:    and a0, a1, a0
 ; RV32XTHEADBB-NEXT:    th.ff1 a0, a0
 ; RV32XTHEADBB-NEXT:    li a1, 64
-; RV32XTHEADBB-NEXT:    j .LBB3_3
-; RV32XTHEADBB-NEXT:  .LBB3_2:
+; RV32XTHEADBB-NEXT:    j .LBB3_5
+; RV32XTHEADBB-NEXT:  .LBB3_3:
+; RV32XTHEADBB-NEXT:    li a1, 0
+; RV32XTHEADBB-NEXT:    li a0, 64
+; RV32XTHEADBB-NEXT:    ret
+; RV32XTHEADBB-NEXT:  .LBB3_4:
 ; RV32XTHEADBB-NEXT:    addi a1, a0, -1
 ; RV32XTHEADBB-NEXT:    not a0, a0
 ; RV32XTHEADBB-NEXT:    and a0, a0, a1
 ; RV32XTHEADBB-NEXT:    th.ff1 a0, a0
 ; RV32XTHEADBB-NEXT:    li a1, 32
-; RV32XTHEADBB-NEXT:  .LBB3_3:
+; RV32XTHEADBB-NEXT:  .LBB3_5: # %cond.false
 ; RV32XTHEADBB-NEXT:    sub a0, a1, a0
 ; RV32XTHEADBB-NEXT:    li a1, 0
 ; RV32XTHEADBB-NEXT:    ret
@@ -1348,14 +1360,17 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i64:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    or a2, a0, a1
+; RV32I-NEXT:    beqz a2, .LBB11_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    lui a3, 209715
 ; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
 ; RV32I-NEXT:    addi a3, a3, 819
 ; RV32I-NEXT:    addi a2, a5, -241
-; RV32I-NEXT:    bnez a1, .LBB11_2
-; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    bnez a1, .LBB11_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 2
@@ -1385,7 +1400,11 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    addi a0, a0, 32
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB11_2:
+; RV32I-NEXT:  .LBB11_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a0, 64
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB11_4:
 ; RV32I-NEXT:    srli a0, a1, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
@@ -1468,6 +1487,9 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ;
 ; RV32M-LABEL: test_ctlz_i64:
 ; RV32M:       # %bb.0:
+; RV32M-NEXT:    or a2, a0, a1
+; RV32M-NEXT:    beqz a2, .LBB11_3
+; RV32M-NEXT:  # %bb.1: # %cond.false
 ; RV32M-NEXT:    lui a2, 349525
 ; RV32M-NEXT:    lui a3, 209715
 ; RV32M-NEXT:    lui a6, 61681
@@ -1476,8 +1498,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32M-NEXT:    addi a4, a3, 819
 ; RV32M-NEXT:    addi a3, a6, -241
 ; RV32M-NEXT:    addi a2, a7, 257
-; RV32M-NEXT:    bnez a1, .LBB11_2
-; RV32M-NEXT:  # %bb.1:
+; RV32M-NEXT:    bnez a1, .LBB11_4
+; RV32M-NEXT:  # %bb.2: # %cond.false
 ; RV32M-NEXT:    srli a1, a0, 1
 ; RV32M-NEXT:    or a0, a0, a1
 ; RV32M-NEXT:    srli a1, a0, 2
@@ -1504,7 +1526,11 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32M-NEXT:    addi a0, a0, 32
 ; RV32M-NEXT:    li a1, 0
 ; RV32M-NEXT:    ret
-; RV32M-NEXT:  .LBB11_2:
+; RV32M-NEXT:  .LBB11_3:
+; RV32M-NEXT:    li a1, 0
+; RV32M-NEXT:    li a0, 64
+; RV32M-NEXT:    ret
+; RV32M-NEXT:  .LBB11_4:
 ; RV32M-NEXT:    srli a0, a1, 1
 ; RV32M-NEXT:    or a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 2
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
index 04a2f67c4942b..723437a610ff8 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
@@ -61,14 +61,17 @@ declare i64 @llvm.ctlz.i64(i64, i1)
 define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: ctlz_i64:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    or a2, a0, a1
+; RV32I-NEXT:    beqz a2, .LBB1_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    lui a3, 209715
 ; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
 ; RV32I-NEXT:    addi a3, a3, 819
 ; RV32I-NEXT:    addi a2, a5, -241
-; RV32I-NEXT:    bnez a1, .LBB1_2
-; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    bnez a1, .LBB1_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 2
@@ -98,7 +101,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    addi a0, a0, 32
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    ret
-; RV32I-NEXT:  .LBB1_2:
+; RV32I-NEXT:  .LBB1_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a0, 64
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB1_4:
 ; RV32I-NEXT:    srli a0, a1, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
@@ -200,39 +207,42 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s2, a1
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    and a0, s0, a0
-; RV32I-NEXT:    lui a1, 30667
-; RV32I-NEXT:    addi s3, a1, 1329
-; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    mv s0, a1
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    beqz a1, .LBB3_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    lui a2, 30667
+; RV32I-NEXT:    addi s2, a2, 1329
+; RV32I-NEXT:    mv s4, a0
+; RV32I-NEXT:    mv a0, a1
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __mulsi3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
-; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
-; RV32I-NEXT:    neg a0, s2
-; RV32I-NEXT:    and a0, s2, a0
-; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    lui s3, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s3, s3, %lo(.LCPI3_0)
+; RV32I-NEXT:    neg a0, s0
+; RV32I-NEXT:    and a0, s0, a0
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __mulsi3
-; RV32I-NEXT:    bnez s2, .LBB3_3
-; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    li a0, 32
-; RV32I-NEXT:    beqz s0, .LBB3_4
-; RV32I-NEXT:  .LBB3_2:
-; RV32I-NEXT:    srli s1, s1, 27
-; RV32I-NEXT:    add s1, s4, s1
-; RV32I-NEXT:    lbu a0, 0(s1)
-; RV32I-NEXT:    j .LBB3_5
-; RV32I-NEXT:  .LBB3_3:
+; RV32I-NEXT:    bnez s4, .LBB3_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
 ; RV32I-NEXT:    srli a0, a0, 27
-; RV32I-NEXT:    add a0, s4, a0
+; RV32I-NEXT:    add a0, s3, a0
 ; RV32I-NEXT:    lbu a0, 0(a0)
-; RV32I-NEXT:    bnez s0, .LBB3_2
-; RV32I-NEXT:  .LBB3_4:
 ; RV32I-NEXT:    addi ...
[truncated]

The code below the removed check looks generic enough to support
arbitrary integer widths. This helps 32-bit targets avoid expensive
expansion/libcalls in the case of zero input.
@s-barannikov s-barannikov force-pushed the despeculate-ctlz-cttz branch from 2af83da to 2a33e28 Compare April 24, 2025 15:38
@s-barannikov s-barannikov requested a review from phoebewang April 24, 2025 15:51
@s-barannikov s-barannikov force-pushed the despeculate-ctlz-cttz branch from becd66a to 13fe68d Compare April 25, 2025 13:44
@s-barannikov
Copy link
Contributor Author

@topperc Do RISC-V changes look fine?

Comment on lines +301 to +303
; CHECK-8MBASE-NEXT: bne .LBB3_7
; CHECK-8MBASE-NEXT: @ %bb.4: @ %cond.false
; CHECK-8MBASE-NEXT: beq .LBB3_8
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW
There are some strange branches here and below (on the left, too).
X86 eliminates them in X86TargetLowering::EmitLoweredSelect. AFAICT, ARM's equivalent doesn't do this optimization.
There is also a PPCBranchCoalescing pass that could help here. It is more or less generic, but currently lives under the PowerPC directory.

@s-barannikov s-barannikov requested review from lukel97 and preames April 26, 2025 14:07
@s-barannikov
Copy link
Contributor Author

ping, someone?

@s-barannikov s-barannikov requested a review from koachan April 29, 2025 06:04
Copy link
Contributor

@lukel97 lukel97 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RISC-V changes LGTM

; RV32M-NEXT: add a1, a2, a1
; RV32M-NEXT: lbu a1, 0(a1)
; RV32M-NEXT: bnez a0, .LBB3_2
; RV32M-NEXT: li a1, 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not related to this patch, but a1 needs to be zero to get here. Can we remove this li?

Copy link
Contributor Author

@s-barannikov s-barannikov Apr 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is something that could be handled by RISCVRedundantCopyElimination (after some improvements like supporting AND/OR), but the context for the optimization is created later, by TailDuplicatePass. (It runs 14 passes later.)

bb.0 (%ir-block.0):
  successors: %bb.1(0x30000000), %bb.3(0x50000000); %bb.1(37.50%), %bb.3(62.50%)
  liveins: $x10, $x11
  renamable $x12 = OR renamable $x10, renamable $x11
  BNE killed renamable $x12, $x0, %bb.3

bb.1:
; predecessors: %bb.0

  renamable $x11 = COPY $x0
  renamable $x10 = ADDI $x0, 64

I think the extra li may not be a big problem here as the result of cttz is usually truncated to 32 bits.

@koachan
Copy link
Contributor

koachan commented Apr 29, 2025

SPARC changes seem OK

@s-barannikov s-barannikov merged commit becd418 into llvm:main Apr 29, 2025
9 of 10 checks passed
@s-barannikov s-barannikov deleted the despeculate-ctlz-cttz branch April 29, 2025 19:33
IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
The code below the removed check looks generic enough to support
arbitrary integer widths. This change helps 32-bit targets avoid
expensive expansion/libcalls in the case of zero input.

Pull Request: llvm#137197
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request May 6, 2025
…#137197)

The code below the removed check looks generic enough to support
arbitrary integer widths. This change helps 32-bit targets avoid
expensive expansion/libcalls in the case of zero input.

Pull Request: llvm/llvm-project#137197
GeorgeARM pushed a commit to GeorgeARM/llvm-project that referenced this pull request May 7, 2025
The code below the removed check looks generic enough to support
arbitrary integer widths. This change helps 32-bit targets avoid
expensive expansion/libcalls in the case of zero input.

Pull Request: llvm#137197
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

6 participants