-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[CGP] Despeculate ctlz/cttz with "illegal" integer types #137197
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-llvm-globalisel Author: Sergei Barannikov (s-barannikov) ChangesThe code below the removed check looks generic enough to support arbitrary integer widths. This change helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input. Patch is 40.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137197.diff 11 Files Affected:
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index e8dc7752b23c0..f9dcb472ed1d2 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2552,9 +2552,9 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
(IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
return false;
- // Only handle legal scalar cases. Anything else requires too much work.
+ // Only handle scalar cases. Anything else requires too much work.
unsigned SizeInBits = Ty->getScalarSizeInBits();
- if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
+ if (Ty->isVectorTy())
return false;
// Bail if the value is never zero.
diff --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll
index 76adc61c5971f..1146ad64ee709 100644
--- a/llvm/test/CodeGen/ARM/cttz.ll
+++ b/llvm/test/CodeGen/ARM/cttz.ll
@@ -221,43 +221,49 @@ define i64 @test_i64(i64 %a) {
;
; CHECK-6M-LABEL: test_i64:
; CHECK-6M: @ %bb.0:
-; CHECK-6M-NEXT: .save {r4, r5, r7, lr}
-; CHECK-6M-NEXT: push {r4, r5, r7, lr}
+; CHECK-6M-NEXT: .save {r4, r5, r6, lr}
+; CHECK-6M-NEXT: push {r4, r5, r6, lr}
+; CHECK-6M-NEXT: mov r3, r1
; CHECK-6M-NEXT: mov r2, r0
-; CHECK-6M-NEXT: ldr r5, .LCPI3_0
-; CHECK-6M-NEXT: adr r3, .LCPI3_1
+; CHECK-6M-NEXT: movs r1, #0
+; CHECK-6M-NEXT: orrs r0, r3
+; CHECK-6M-NEXT: beq .LBB3_6
+; CHECK-6M-NEXT: @ %bb.1: @ %cond.false
+; CHECK-6M-NEXT: ldr r6, .LCPI3_0
+; CHECK-6M-NEXT: adr r4, .LCPI3_1
; CHECK-6M-NEXT: movs r0, #32
-; CHECK-6M-NEXT: cmp r1, #0
-; CHECK-6M-NEXT: mov r4, r0
-; CHECK-6M-NEXT: beq .LBB3_2
-; CHECK-6M-NEXT: @ %bb.1:
-; CHECK-6M-NEXT: rsbs r4, r1, #0
-; CHECK-6M-NEXT: ands r4, r1
-; CHECK-6M-NEXT: muls r4, r5, r4
-; CHECK-6M-NEXT: lsrs r1, r4, #27
-; CHECK-6M-NEXT: ldrb r4, [r3, r1]
-; CHECK-6M-NEXT: .LBB3_2:
-; CHECK-6M-NEXT: adds r4, #32
-; CHECK-6M-NEXT: rsbs r1, r2, #0
-; CHECK-6M-NEXT: ands r1, r2
-; CHECK-6M-NEXT: muls r5, r1, r5
-; CHECK-6M-NEXT: lsrs r1, r5, #27
+; CHECK-6M-NEXT: cmp r3, #0
+; CHECK-6M-NEXT: mov r5, r0
+; CHECK-6M-NEXT: beq .LBB3_3
+; CHECK-6M-NEXT: @ %bb.2: @ %cond.false
+; CHECK-6M-NEXT: rsbs r5, r3, #0
+; CHECK-6M-NEXT: ands r5, r3
+; CHECK-6M-NEXT: muls r5, r6, r5
+; CHECK-6M-NEXT: lsrs r3, r5, #27
+; CHECK-6M-NEXT: ldrb r5, [r4, r3]
+; CHECK-6M-NEXT: .LBB3_3: @ %cond.false
+; CHECK-6M-NEXT: adds r5, #32
+; CHECK-6M-NEXT: rsbs r3, r2, #0
+; CHECK-6M-NEXT: ands r3, r2
+; CHECK-6M-NEXT: muls r6, r3, r6
+; CHECK-6M-NEXT: lsrs r3, r6, #27
; CHECK-6M-NEXT: cmp r2, #0
-; CHECK-6M-NEXT: bne .LBB3_5
-; CHECK-6M-NEXT: @ %bb.3:
-; CHECK-6M-NEXT: beq .LBB3_6
-; CHECK-6M-NEXT: .LBB3_4:
-; CHECK-6M-NEXT: movs r1, #0
-; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
-; CHECK-6M-NEXT: .LBB3_5:
-; CHECK-6M-NEXT: ldrb r0, [r3, r1]
-; CHECK-6M-NEXT: bne .LBB3_4
+; CHECK-6M-NEXT: bne .LBB3_7
+; CHECK-6M-NEXT: @ %bb.4: @ %cond.false
+; CHECK-6M-NEXT: beq .LBB3_8
+; CHECK-6M-NEXT: .LBB3_5: @ %cond.end
+; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
; CHECK-6M-NEXT: .LBB3_6:
-; CHECK-6M-NEXT: mov r0, r4
-; CHECK-6M-NEXT: movs r1, #0
-; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
+; CHECK-6M-NEXT: movs r0, #64
+; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
+; CHECK-6M-NEXT: .LBB3_7: @ %cond.false
+; CHECK-6M-NEXT: ldrb r0, [r4, r3]
+; CHECK-6M-NEXT: bne .LBB3_5
+; CHECK-6M-NEXT: .LBB3_8: @ %cond.false
+; CHECK-6M-NEXT: mov r0, r5
+; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
; CHECK-6M-NEXT: .p2align 2
-; CHECK-6M-NEXT: @ %bb.7:
+; CHECK-6M-NEXT: @ %bb.9:
; CHECK-6M-NEXT: .LCPI3_0:
; CHECK-6M-NEXT: .long 125613361 @ 0x77cb531
; CHECK-6M-NEXT: .LCPI3_1:
@@ -265,43 +271,49 @@ define i64 @test_i64(i64 %a) {
;
; CHECK-8MBASE-LABEL: test_i64:
; CHECK-8MBASE: @ %bb.0:
-; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr}
-; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr}
+; CHECK-8MBASE-NEXT: .save {r4, r5, r6, lr}
+; CHECK-8MBASE-NEXT: push {r4, r5, r6, lr}
+; CHECK-8MBASE-NEXT: mov r3, r1
; CHECK-8MBASE-NEXT: mov r2, r0
-; CHECK-8MBASE-NEXT: movw r5, #46385
-; CHECK-8MBASE-NEXT: movt r5, #1916
-; CHECK-8MBASE-NEXT: adr r3, .LCPI3_0
+; CHECK-8MBASE-NEXT: movs r1, #0
+; CHECK-8MBASE-NEXT: orrs r0, r3
+; CHECK-8MBASE-NEXT: beq .LBB3_6
+; CHECK-8MBASE-NEXT: @ %bb.1: @ %cond.false
+; CHECK-8MBASE-NEXT: movw r6, #46385
+; CHECK-8MBASE-NEXT: movt r6, #1916
+; CHECK-8MBASE-NEXT: adr r4, .LCPI3_0
; CHECK-8MBASE-NEXT: movs r0, #32
-; CHECK-8MBASE-NEXT: mov r4, r0
-; CHECK-8MBASE-NEXT: cbz r1, .LBB3_2
-; CHECK-8MBASE-NEXT: @ %bb.1:
-; CHECK-8MBASE-NEXT: rsbs r4, r1, #0
-; CHECK-8MBASE-NEXT: ands r4, r1
-; CHECK-8MBASE-NEXT: muls r4, r5, r4
-; CHECK-8MBASE-NEXT: lsrs r1, r4, #27
-; CHECK-8MBASE-NEXT: ldrb r4, [r3, r1]
-; CHECK-8MBASE-NEXT: .LBB3_2:
-; CHECK-8MBASE-NEXT: adds r4, #32
-; CHECK-8MBASE-NEXT: rsbs r1, r2, #0
-; CHECK-8MBASE-NEXT: ands r1, r2
-; CHECK-8MBASE-NEXT: muls r5, r1, r5
-; CHECK-8MBASE-NEXT: lsrs r1, r5, #27
+; CHECK-8MBASE-NEXT: mov r5, r0
+; CHECK-8MBASE-NEXT: cbz r3, .LBB3_3
+; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false
+; CHECK-8MBASE-NEXT: rsbs r5, r3, #0
+; CHECK-8MBASE-NEXT: ands r5, r3
+; CHECK-8MBASE-NEXT: muls r5, r6, r5
+; CHECK-8MBASE-NEXT: lsrs r3, r5, #27
+; CHECK-8MBASE-NEXT: ldrb r5, [r4, r3]
+; CHECK-8MBASE-NEXT: .LBB3_3: @ %cond.false
+; CHECK-8MBASE-NEXT: adds r5, #32
+; CHECK-8MBASE-NEXT: rsbs r3, r2, #0
+; CHECK-8MBASE-NEXT: ands r3, r2
+; CHECK-8MBASE-NEXT: muls r6, r3, r6
+; CHECK-8MBASE-NEXT: lsrs r3, r6, #27
; CHECK-8MBASE-NEXT: cmp r2, #0
-; CHECK-8MBASE-NEXT: bne .LBB3_5
-; CHECK-8MBASE-NEXT: @ %bb.3:
-; CHECK-8MBASE-NEXT: beq .LBB3_6
-; CHECK-8MBASE-NEXT: .LBB3_4:
-; CHECK-8MBASE-NEXT: movs r1, #0
-; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
-; CHECK-8MBASE-NEXT: .LBB3_5:
-; CHECK-8MBASE-NEXT: ldrb r0, [r3, r1]
-; CHECK-8MBASE-NEXT: bne .LBB3_4
+; CHECK-8MBASE-NEXT: bne .LBB3_7
+; CHECK-8MBASE-NEXT: @ %bb.4: @ %cond.false
+; CHECK-8MBASE-NEXT: beq .LBB3_8
+; CHECK-8MBASE-NEXT: .LBB3_5: @ %cond.end
+; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
; CHECK-8MBASE-NEXT: .LBB3_6:
-; CHECK-8MBASE-NEXT: mov r0, r4
-; CHECK-8MBASE-NEXT: movs r1, #0
-; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
+; CHECK-8MBASE-NEXT: movs r0, #64
+; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
+; CHECK-8MBASE-NEXT: .LBB3_7: @ %cond.false
+; CHECK-8MBASE-NEXT: ldrb r0, [r4, r3]
+; CHECK-8MBASE-NEXT: bne .LBB3_5
+; CHECK-8MBASE-NEXT: .LBB3_8: @ %cond.false
+; CHECK-8MBASE-NEXT: mov r0, r5
+; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
; CHECK-8MBASE-NEXT: .p2align 2
-; CHECK-8MBASE-NEXT: @ %bb.7:
+; CHECK-8MBASE-NEXT: @ %bb.9:
; CHECK-8MBASE-NEXT: .LCPI3_0:
; CHECK-8MBASE-NEXT: .ascii "\000\001\034\002\035\016\030\003\036\026\024\017\031\021\004\b\037\033\r\027\025\023\020\007\032\f\022\006\013\005\n\t"
%tmp = call i64 @llvm.cttz.i64(i64 %a, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
index f9af74d6ec323..0632caecf8907 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
@@ -62,6 +62,9 @@ declare i64 @llvm.ctlz.i64(i64, i1)
define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-LABEL: ctlz_i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: or a2, a0, a1
+; RV32I-NEXT: beqz a2, .LBB1_3
+; RV32I-NEXT: # %bb.1: # %cond.false
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: lui a6, 61681
@@ -69,8 +72,8 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: addi a4, a3, 819
; RV32I-NEXT: addi a3, a6, -241
; RV32I-NEXT: li a2, 32
-; RV32I-NEXT: beqz a1, .LBB1_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: beqz a1, .LBB1_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
@@ -99,7 +102,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: sub a0, a2, a0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB1_2:
+; RV32I-NEXT: .LBB1_3:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB1_4:
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
@@ -195,14 +202,17 @@ declare i64 @llvm.cttz.i64(i64, i1)
define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-LABEL: cttz_i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: or a2, a0, a1
+; RV32I-NEXT: beqz a2, .LBB3_3
+; RV32I-NEXT: # %bb.1: # %cond.false
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: lui a5, 61681
; RV32I-NEXT: addi a4, a2, 1365
; RV32I-NEXT: addi a3, a3, 819
; RV32I-NEXT: addi a2, a5, -241
-; RV32I-NEXT: beqz a0, .LBB3_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: beqz a0, .LBB3_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: not a1, a0
; RV32I-NEXT: addi a0, a0, -1
; RV32I-NEXT: and a0, a1, a0
@@ -223,7 +233,11 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: srli a0, a0, 24
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB3_2:
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: not a0, a1
; RV32I-NEXT: addi a1, a1, -1
; RV32I-NEXT: and a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index a46168f114bb9..3a7d31253b05d 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -374,39 +374,42 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: and a0, s0, a0
-; RV32I-NEXT: lui a1, 30667
-; RV32I-NEXT: addi s3, a1, 1329
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: or a1, a0, a1
+; RV32I-NEXT: beqz a1, .LBB3_3
+; RV32I-NEXT: # %bb.1: # %cond.false
+; RV32I-NEXT: neg a1, a0
+; RV32I-NEXT: and a1, a0, a1
+; RV32I-NEXT: lui a2, 30667
+; RV32I-NEXT: addi s2, a2, 1329
+; RV32I-NEXT: mv s4, a0
+; RV32I-NEXT: mv a0, a1
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui s4, %hi(.LCPI3_0)
-; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0)
-; RV32I-NEXT: neg a0, s2
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: lui s3, %hi(.LCPI3_0)
+; RV32I-NEXT: addi s3, s3, %lo(.LCPI3_0)
+; RV32I-NEXT: neg a0, s0
+; RV32I-NEXT: and a0, s0, a0
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
-; RV32I-NEXT: bnez s2, .LBB3_3
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: li a0, 32
-; RV32I-NEXT: beqz s0, .LBB3_4
-; RV32I-NEXT: .LBB3_2:
-; RV32I-NEXT: srli s1, s1, 27
-; RV32I-NEXT: add s1, s4, s1
-; RV32I-NEXT: lbu a0, 0(s1)
-; RV32I-NEXT: j .LBB3_5
-; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: bnez s4, .LBB3_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: add a0, s4, a0
+; RV32I-NEXT: add a0, s3, a0
; RV32I-NEXT: lbu a0, 0(a0)
-; RV32I-NEXT: bnez s0, .LBB3_2
-; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: addi a0, a0, 32
-; RV32I-NEXT: .LBB3_5:
+; RV32I-NEXT: j .LBB3_5
+; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: j .LBB3_6
+; RV32I-NEXT: .LBB3_4:
+; RV32I-NEXT: srli s1, s1, 27
+; RV32I-NEXT: add s1, s3, s1
+; RV32I-NEXT: lbu a0, 0(s1)
+; RV32I-NEXT: .LBB3_5: # %cond.false
; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: .LBB3_6: # %cond.end
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -441,33 +444,35 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
;
; RV32M-LABEL: test_cttz_i64:
; RV32M: # %bb.0:
+; RV32M-NEXT: or a2, a0, a1
+; RV32M-NEXT: beqz a2, .LBB3_3
+; RV32M-NEXT: # %bb.1: # %cond.false
; RV32M-NEXT: lui a2, 30667
; RV32M-NEXT: addi a3, a2, 1329
; RV32M-NEXT: lui a2, %hi(.LCPI3_0)
; RV32M-NEXT: addi a2, a2, %lo(.LCPI3_0)
-; RV32M-NEXT: bnez a1, .LBB3_3
-; RV32M-NEXT: # %bb.1:
-; RV32M-NEXT: li a1, 32
-; RV32M-NEXT: beqz a0, .LBB3_4
-; RV32M-NEXT: .LBB3_2:
-; RV32M-NEXT: neg a1, a0
-; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: bnez a0, .LBB3_4
+; RV32M-NEXT: # %bb.2: # %cond.false
+; RV32M-NEXT: neg a0, a1
+; RV32M-NEXT: and a0, a1, a0
; RV32M-NEXT: mul a0, a0, a3
; RV32M-NEXT: srli a0, a0, 27
; RV32M-NEXT: add a0, a2, a0
; RV32M-NEXT: lbu a0, 0(a0)
+; RV32M-NEXT: addi a0, a0, 32
; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
; RV32M-NEXT: .LBB3_3:
-; RV32M-NEXT: neg a4, a1
-; RV32M-NEXT: and a1, a1, a4
-; RV32M-NEXT: mul a1, a1, a3
-; RV32M-NEXT: srli a1, a1, 27
-; RV32M-NEXT: add a1, a2, a1
-; RV32M-NEXT: lbu a1, 0(a1)
-; RV32M-NEXT: bnez a0, .LBB3_2
+; RV32M-NEXT: li a1, 0
+; RV32M-NEXT: li a0, 64
+; RV32M-NEXT: ret
; RV32M-NEXT: .LBB3_4:
-; RV32M-NEXT: addi a0, a1, 32
+; RV32M-NEXT: neg a1, a0
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: mul a0, a0, a3
+; RV32M-NEXT: srli a0, a0, 27
+; RV32M-NEXT: add a0, a2, a0
+; RV32M-NEXT: lbu a0, 0(a0)
; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
;
@@ -510,21 +515,28 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
;
; RV32XTHEADBB-LABEL: test_cttz_i64:
; RV32XTHEADBB: # %bb.0:
-; RV32XTHEADBB-NEXT: bnez a0, .LBB3_2
-; RV32XTHEADBB-NEXT: # %bb.1:
+; RV32XTHEADBB-NEXT: or a2, a0, a1
+; RV32XTHEADBB-NEXT: beqz a2, .LBB3_3
+; RV32XTHEADBB-NEXT: # %bb.1: # %cond.false
+; RV32XTHEADBB-NEXT: bnez a0, .LBB3_4
+; RV32XTHEADBB-NEXT: # %bb.2: # %cond.false
; RV32XTHEADBB-NEXT: addi a0, a1, -1
; RV32XTHEADBB-NEXT: not a1, a1
; RV32XTHEADBB-NEXT: and a0, a1, a0
; RV32XTHEADBB-NEXT: th.ff1 a0, a0
; RV32XTHEADBB-NEXT: li a1, 64
-; RV32XTHEADBB-NEXT: j .LBB3_3
-; RV32XTHEADBB-NEXT: .LBB3_2:
+; RV32XTHEADBB-NEXT: j .LBB3_5
+; RV32XTHEADBB-NEXT: .LBB3_3:
+; RV32XTHEADBB-NEXT: li a1, 0
+; RV32XTHEADBB-NEXT: li a0, 64
+; RV32XTHEADBB-NEXT: ret
+; RV32XTHEADBB-NEXT: .LBB3_4:
; RV32XTHEADBB-NEXT: addi a1, a0, -1
; RV32XTHEADBB-NEXT: not a0, a0
; RV32XTHEADBB-NEXT: and a0, a0, a1
; RV32XTHEADBB-NEXT: th.ff1 a0, a0
; RV32XTHEADBB-NEXT: li a1, 32
-; RV32XTHEADBB-NEXT: .LBB3_3:
+; RV32XTHEADBB-NEXT: .LBB3_5: # %cond.false
; RV32XTHEADBB-NEXT: sub a0, a1, a0
; RV32XTHEADBB-NEXT: li a1, 0
; RV32XTHEADBB-NEXT: ret
@@ -1348,14 +1360,17 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32I-LABEL: test_ctlz_i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: or a2, a0, a1
+; RV32I-NEXT: beqz a2, .LBB11_3
+; RV32I-NEXT: # %bb.1: # %cond.false
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: lui a5, 61681
; RV32I-NEXT: addi a4, a2, 1365
; RV32I-NEXT: addi a3, a3, 819
; RV32I-NEXT: addi a2, a5, -241
-; RV32I-NEXT: bnez a1, .LBB11_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: bnez a1, .LBB11_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
@@ -1385,7 +1400,11 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB11_2:
+; RV32I-NEXT: .LBB11_3:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB11_4:
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
@@ -1468,6 +1487,9 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
;
; RV32M-LABEL: test_ctlz_i64:
; RV32M: # %bb.0:
+; RV32M-NEXT: or a2, a0, a1
+; RV32M-NEXT: beqz a2, .LBB11_3
+; RV32M-NEXT: # %bb.1: # %cond.false
; RV32M-NEXT: lui a2, 349525
; RV32M-NEXT: lui a3, 209715
; RV32M-NEXT: lui a6, 61681
@@ -1476,8 +1498,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32M-NEXT: addi a4, a3, 819
; RV32M-NEXT: addi a3, a6, -241
; RV32M-NEXT: addi a2, a7, 257
-; RV32M-NEXT: bnez a1, .LBB11_2
-; RV32M-NEXT: # %bb.1:
+; RV32M-NEXT: bnez a1, .LBB11_4
+; RV32M-NEXT: # %bb.2: # %cond.false
; RV32M-NEXT: srli a1, a0, 1
; RV32M-NEXT: or a0, a0, a1
; RV32M-NEXT: srli a1, a0, 2
@@ -1504,7 +1526,11 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32M-NEXT: addi a0, a0, 32
; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
-; RV32M-NEXT: .LBB11_2:
+; RV32M-NEXT: .LBB11_3:
+; RV32M-NEXT: li a1, 0
+; RV32M-NEXT: li a0, 64
+; RV32M-NEXT: ret
+; RV32M-NEXT: .LBB11_4:
; RV32M-NEXT: srli a0, a1, 1
; RV32M-NEXT: or a0, a1, a0
; RV32M-NEXT: srli a1, a0, 2
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
index 04a2f67c4942b..723437a610ff8 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
@@ -61,14 +61,17 @@ declare i64 @llvm.ctlz.i64(i64, i1)
define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-LABEL: ctlz_i64:
; RV32I: # %bb.0:
+; RV32I-NEXT: or a2, a0, a1
+; RV32I-NEXT: beqz a2, .LBB1_3
+; RV32I-NEXT: # %bb.1: # %cond.false
; RV32I-NEXT: lui a2, 349525
; RV32I-NEXT: lui a3, 209715
; RV32I-NEXT: lui a5, 61681
; RV32I-NEXT: addi a4, a2, 1365
; RV32I-NEXT: addi a3, a3, 819
; RV32I-NEXT: addi a2, a5, -241
-; RV32I-NEXT: bnez a1, .LBB1_2
-; RV32I-NEXT: # %bb.1:
+; RV32I-NEXT: bnez a1, .LBB1_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a1, a0, 1
; RV32I-NEXT: or a0, a0, a1
; RV32I-NEXT: srli a1, a0, 2
@@ -98,7 +101,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB1_2:
+; RV32I-NEXT: .LBB1_3:
+; RV32I-NEXT: li a1, 0
+; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: ret
+; RV32I-NEXT: .LBB1_4:
; RV32I-NEXT: srli a0, a1, 1
; RV32I-NEXT: or a0, a1, a0
; RV32I-NEXT: srli a1, a0, 2
@@ -200,39 +207,42 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s2, a1
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: neg a0, a0
-; RV32I-NEXT: and a0, s0, a0
-; RV32I-NEXT: lui a1, 30667
-; RV32I-NEXT: addi s3, a1, 1329
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: mv s0, a1
+; RV32I-NEXT: or a1, a0, a1
+; RV32I-NEXT: beqz a1, .LBB3_3
+; RV32I-NEXT: # %bb.1: # %cond.false
+; RV32I-NEXT: neg a1, a0
+; RV32I-NEXT: and a1, a0, a1
+; RV32I-NEXT: lui a2, 30667
+; RV32I-NEXT: addi s2, a2, 1329
+; RV32I-NEXT: mv s4, a0
+; RV32I-NEXT: mv a0, a1
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui s4, %hi(.LCPI3_0)
-; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0)
-; RV32I-NEXT: neg a0, s2
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: lui s3, %hi(.LCPI3_0)
+; RV32I-NEXT: addi s3, s3, %lo(.LCPI3_0)
+; RV32I-NEXT: neg a0, s0
+; RV32I-NEXT: and a0, s0, a0
+; RV32I-NEXT: mv a1, s2
; RV32I-NEXT: call __mulsi3
-; RV32I-NEXT: bnez s2, .LBB3_3
-; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: li a0, 32
-; RV32I-NEXT: beqz s0, .LBB3_4
-; RV32I-NEXT: .LBB3_2:
-; RV32I-NEXT: srli s1, s1, 27
-; RV32I-NEXT: add s1, s4, s1
-; RV32I-NEXT: lbu a0, 0(s1)
-; RV32I-NEXT: j .LBB3_5
-; RV32I-NEXT: .LBB3_3:
+; RV32I-NEXT: bnez s4, .LBB3_4
+; RV32I-NEXT: # %bb.2: # %cond.false
; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: add a0, s4, a0
+; RV32I-NEXT: add a0, s3, a0
; RV32I-NEXT: lbu a0, 0(a0)
-; RV32I-NEXT: bnez s0, .LBB3_2
-; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: addi ...
[truncated]
|
The code below the removed check looks generic enough to support arbitrary integer widths. This helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input.
2af83da to
2a33e28
Compare
becd66a to
13fe68d
Compare
|
@topperc Do RISC-V changes look fine? |
| ; CHECK-8MBASE-NEXT: bne .LBB3_7 | ||
| ; CHECK-8MBASE-NEXT: @ %bb.4: @ %cond.false | ||
| ; CHECK-8MBASE-NEXT: beq .LBB3_8 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FWIW
There are some strange branches here and below (on the left, too).
X86 eliminates them in X86TargetLowering::EmitLoweredSelect. AFAICT ARM's equivalent doesn't do this optimization.
There is also a PPCBranchCoalescing pass that could help here. It is more or less generic, but currently lives under the PowerPC directory.
|
ping, someone? |
lukel97
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
RISC-V changes LGTM
| ; RV32M-NEXT: add a1, a2, a1 | ||
| ; RV32M-NEXT: lbu a1, 0(a1) | ||
| ; RV32M-NEXT: bnez a0, .LBB3_2 | ||
| ; RV32M-NEXT: li a1, 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not related to this patch, but a1 needs to be zero to get here; can we remove this li?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is something that could be handled by RISCVRedundantCopyElimination (after some improvements like supporting AND/OR), but the context for the optimization is created later, by TailDuplicatePass. (It is +14 passes later.)
bb.0 (%ir-block.0):
successors: %bb.1(0x30000000), %bb.3(0x50000000); %bb.1(37.50%), %bb.3(62.50%)
liveins: $x10, $x11
renamable $x12 = OR renamable $x10, renamable $x11
BNE killed renamable $x12, $x0, %bb.3
bb.1:
; predecessors: %bb.0
renamable $x11 = COPY $x0
renamable $x10 = ADDI $x0, 64
I think the extra li may not be a big problem here as the result of cttz is usually truncated to 32 bits.
|
SPARC changes seem OK |
The code below the removed check looks generic enough to support arbitrary integer widths. This change helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input. Pull Request: llvm#137197
…#137197) The code below the removed check looks generic enough to support arbitrary integer widths. This change helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input. Pull Request: llvm/llvm-project#137197
The code below the removed check looks generic enough to support arbitrary integer widths. This change helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input. Pull Request: llvm#137197
The code below the removed check looks generic enough to support arbitrary integer widths. This change helps 32-bit targets avoid expensive expansion/libcalls in the case of zero input.