From b2ea0c39e7be54e0ab068d34dfb395a99a8902cb Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 17 Nov 2025 16:34:29 +0000
Subject: [PATCH] [X86] combineTruncate - trunc(srl(load(p),amt)) ->
 load(p+amt/8) - ensure amt doesn't depend on original load chain

Relax the fix for #165755 / #165850 - it doesn't matter if amt depends on
the original load's value result, only whether it depends on users of the
load's chain result.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |   7 +-
 llvm/test/CodeGen/X86/bittest-big-integer.ll | 272 +++++++------------
 2 files changed, 103 insertions(+), 176 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 621f1868d3311..864e5dc67682c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54688,11 +54688,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
       // Check the shift amount is byte aligned.
       // Check the truncation doesn't use any shifted in (zero) top bits.
-      // Check the shift amount doesn't depend on the original load.
+      // Check the shift amount doesn't depend on the original load chain.
       if (KnownAmt.countMinTrailingZeros() >= 3 &&
           KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
                                      VT.getSizeInBits()) &&
-          !Ld->isPredecessorOf(ShAmt.getNode())) {
+          none_of(Ld->uses(), [&ShAmt](SDUse &Use) {
+            return Use.getResNo() == 1 &&
+                   Use.getUser()->isPredecessorOf(ShAmt.getNode());
+          })) {
         EVT PtrVT = Ld->getBasePtr().getValueType();
         SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
         SDValue PtrByteOfs =
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index b85a20b9d6b6e..023fb5065b892 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1877,85 +1877,56 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pushq %r15
 ; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %r12
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    movq 56(%rdi), %rcx
-; SSE-NEXT:    movq 48(%rdi), %rdx
-; SSE-NEXT:    movq 40(%rdi), %rsi
-; SSE-NEXT:    movq 32(%rdi), %r11
+; SSE-NEXT:    movq 48(%rdi), %r11
+; SSE-NEXT:    movq 40(%rdi), %r9
 ; SSE-NEXT:    movq 24(%rdi), %r8
-; SSE-NEXT:    movq 16(%rdi), %r9
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    movq 8(%rdi), %r10
-; SSE-NEXT:    rep bsfq %rax, %rbx
-; SSE-NEXT:    rep bsfq %r10, %r14
-; SSE-NEXT:    addq $64, %r14
-; SSE-NEXT:    testq %rax, %rax
-; SSE-NEXT:    cmovneq %rbx, %r14
-; SSE-NEXT:    rep bsfq %r9, %r15
-; SSE-NEXT:    rep bsfq %r8, %rbx
+; SSE-NEXT:    movq 16(%rdi), %rdx
+; SSE-NEXT:    movq (%rdi), %rcx
+; SSE-NEXT:    movq 8(%rdi), %rsi
+; SSE-NEXT:    rep bsfq %rcx, %rax
+; SSE-NEXT:    rep bsfq %rsi, %rbx
 ; SSE-NEXT:    addq $64, %rbx
-; SSE-NEXT:    testq %r9, %r9
-; SSE-NEXT:    cmovneq %r15, %rbx
-; SSE-NEXT:    subq $-128, %rbx
-; SSE-NEXT:    movq %rax, %r15
-; SSE-NEXT:    movq %rax, %r12
-; SSE-NEXT:    orq %r10, %r12
-; SSE-NEXT:    cmovneq %r14, %rbx
-; SSE-NEXT:    rep bsfq %r11, %r12
-; SSE-NEXT:    rep bsfq %rsi, %r14
-; SSE-NEXT:    addq $64, %r14
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovneq %r12, %r14
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    rep bsfq %rdx, %r12
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovneq %rax, %rbx
+; SSE-NEXT:    rep bsfq %rdx, %rax
+; SSE-NEXT:    rep bsfq %r8, %r10
+; SSE-NEXT:    addq $64, %r10
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovneq %rax, %r10
+; SSE-NEXT:    movq 32(%rdi), %r14
+; SSE-NEXT:    subq $-128, %r10
+; SSE-NEXT:    movq %rcx, %rax
+; SSE-NEXT:    orq %rsi, %rax
+; SSE-NEXT:    cmovneq %rbx, %r10
+; SSE-NEXT:    rep bsfq %r14, %rax
+; SSE-NEXT:    rep bsfq %r9, %rbx
+; SSE-NEXT:    addq $64, %rbx
+; SSE-NEXT:    testq %r14, %r14
+; SSE-NEXT:    cmovneq %rax, %rbx
+; SSE-NEXT:    rep bsfq %r11, %r15
 ; SSE-NEXT:    movl $64, %eax
-; SSE-NEXT:    rep bsfq %rcx, %rax
+; SSE-NEXT:    rep bsfq 56(%rdi), %rax
 ; SSE-NEXT:    addq $64, %rax
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovneq %r12, %rax
+; SSE-NEXT:    testq %r11, %r11
+; SSE-NEXT:    cmovneq %r15, %rax
 ; SSE-NEXT:    subq $-128, %rax
-; SSE-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    orq %rsi, %r11
-; SSE-NEXT:    cmovneq %r14, %rax
-; SSE-NEXT:    addq $256, %rax # imm = 0x100
-; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    orq %r8, %r10
-; SSE-NEXT:    orq %r9, %r15
-; SSE-NEXT:    orq %r10, %r15
+; SSE-NEXT:    orq %r9, %r14
 ; SSE-NEXT:    cmovneq %rbx, %rax
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    addq $256, %rax # imm = 0x100
+; SSE-NEXT:    orq %r8, %rsi
+; SSE-NEXT:    orq %rdx, %rcx
+; SSE-NEXT:    orq %rsi, %rcx
+; SSE-NEXT:    cmovneq %r10, %rax
+; SSE-NEXT:    movl $-2, %edx
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    roll %cl, %edx
 ; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    andl $32, %ecx
-; SSE-NEXT:    movl %eax, %edx
-; SSE-NEXT:    andl $480, %edx # imm = 0x1E0
-; SSE-NEXT:    shrl $3, %edx
-; SSE-NEXT:    movl %edx, %esi
-; SSE-NEXT:    andl $-8, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi), %r8
-; SSE-NEXT:    shrq %cl, %r8
-; SSE-NEXT:    movl -120(%rsp,%rsi), %esi
-; SSE-NEXT:    addl %esi, %esi
-; SSE-NEXT:    notl %ecx
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shlq %cl, %rsi
-; SSE-NEXT:    orl %r8d, %esi
-; SSE-NEXT:    btrl %eax, %esi
-; SSE-NEXT:    movl %esi, (%rdi,%rdx)
+; SSE-NEXT:    shrl $3, %ecx
+; SSE-NEXT:    andl $60, %ecx
+; SSE-NEXT:    andl %edx, (%rdi,%rcx)
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    addq $8, %rsp
 ; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r12
 ; SSE-NEXT:    popq %r14
 ; SSE-NEXT:    popq %r15
 ; SSE-NEXT:    retq
@@ -1964,133 +1935,86 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %r15
 ; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 56(%rdi), %rcx
-; AVX2-NEXT:    movq 40(%rdi), %rdx
-; AVX2-NEXT:    movq 32(%rdi), %r11
-; AVX2-NEXT:    movq 24(%rdi), %rsi
-; AVX2-NEXT:    movq 16(%rdi), %r8
-; AVX2-NEXT:    movq (%rdi), %r9
-; AVX2-NEXT:    movq 8(%rdi), %r10
-; AVX2-NEXT:    xorl %ebx, %ebx
-; AVX2-NEXT:    tzcntq %r9, %rbx
-; AVX2-NEXT:    tzcntq %r10, %rax
-; AVX2-NEXT:    addq $64, %rax
-; AVX2-NEXT:    testq %r9, %r9
-; AVX2-NEXT:    cmovneq %rbx, %rax
-; AVX2-NEXT:    xorl %r14d, %r14d
-; AVX2-NEXT:    tzcntq %r8, %r14
+; AVX2-NEXT:    movq 40(%rdi), %r9
+; AVX2-NEXT:    movq 32(%rdi), %r10
+; AVX2-NEXT:    movq 24(%rdi), %r8
+; AVX2-NEXT:    movq 16(%rdi), %rdx
+; AVX2-NEXT:    movq (%rdi), %rcx
+; AVX2-NEXT:    movq 8(%rdi), %rsi
+; AVX2-NEXT:    tzcntq %rcx, %rax
 ; AVX2-NEXT:    xorl %ebx, %ebx
 ; AVX2-NEXT:    tzcntq %rsi, %rbx
 ; AVX2-NEXT:    addq $64, %rbx
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovneq %r14, %rbx
-; AVX2-NEXT:    subq $-128, %rbx
-; AVX2-NEXT:    movq %r9, %r14
-; AVX2-NEXT:    movq %r9, %r15
-; AVX2-NEXT:    orq %r10, %r15
+; AVX2-NEXT:    testq %rcx, %rcx
 ; AVX2-NEXT:    cmovneq %rax, %rbx
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %r11, %rax
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %rdx, %r12
-; AVX2-NEXT:    addq $64, %r12
-; AVX2-NEXT:    testq %r11, %r11
-; AVX2-NEXT:    cmovneq %rax, %r12
-; AVX2-NEXT:    movq 48(%rdi), %r15
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    tzcntq %r15, %r13
+; AVX2-NEXT:    tzcntq %rdx, %rax
+; AVX2-NEXT:    tzcntq %r8, %r11
+; AVX2-NEXT:    addq $64, %r11
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    cmovneq %rax, %r11
+; AVX2-NEXT:    subq $-128, %r11
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    orq %rsi, %rax
+; AVX2-NEXT:    cmovneq %rbx, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rcx, %rax
+; AVX2-NEXT:    tzcntq %r10, %rax
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %r9, %rbx
+; AVX2-NEXT:    addq $64, %rbx
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    cmovneq %rax, %rbx
+; AVX2-NEXT:    movq 48(%rdi), %r14
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    tzcntq %r14, %r15
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    tzcntq 56(%rdi), %rax
 ; AVX2-NEXT:    addq $64, %rax
-; AVX2-NEXT:    testq %r15, %r15
-; AVX2-NEXT:    cmovneq %r13, %rax
+; AVX2-NEXT:    testq %r14, %r14
+; AVX2-NEXT:    cmovneq %r15, %rax
 ; AVX2-NEXT:    subq $-128, %rax
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    orq %rdx, %r11
-; AVX2-NEXT:    cmovneq %r12, %rax
-; AVX2-NEXT:    addq $256, %rax # imm = 0x100
-; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    orq %rsi, %r10
-; AVX2-NEXT:    orq %r8, %r14
-; AVX2-NEXT:    orq %r10, %r14
+; AVX2-NEXT:    orq %r9, %r10
 ; AVX2-NEXT:    cmovneq %rbx, %rax
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r15, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    addq $256, %rax # imm = 0x100
+; AVX2-NEXT:    orq %r8, %rsi
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    cmovneq %r11, %rax
+; AVX2-NEXT:    movl $-2, %edx
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    roll %cl, %edx
 ; AVX2-NEXT:    movl %eax, %ecx
-; AVX2-NEXT:    andl $32, %ecx
-; AVX2-NEXT:    movl %eax, %edx
-; AVX2-NEXT:    andl $480, %edx # imm = 0x1E0
-; AVX2-NEXT:    shrl $3, %edx
-; AVX2-NEXT:    movl %edx, %esi
-; AVX2-NEXT:    andl $-8, %esi
-; AVX2-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r8
-; AVX2-NEXT:    notl %ecx
-; AVX2-NEXT:    movl -120(%rsp,%rsi), %esi
-; AVX2-NEXT:    addl %esi, %esi
-; AVX2-NEXT:    shlxq %rcx, %rsi, %rcx
-; AVX2-NEXT:    orl %r8d, %ecx
-; AVX2-NEXT:    btrl %eax, %ecx
-; AVX2-NEXT:    movl %ecx, (%rdi,%rdx)
+; AVX2-NEXT:    shrl $3, %ecx
+; AVX2-NEXT:    andl $60, %ecx
+; AVX2-NEXT:    andl %edx, (%rdi,%rcx)
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    popq %r13
 ; AVX2-NEXT:    popq %r14
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: blsr_u512:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    vmovups (%rdi), %ymm0
-; AVX512-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm2
-; AVX512-NEXT:    vpternlogd {{.*#+}} zmm3 = -1
-; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm3
-; AVX512-NEXT:    vpandnq %zmm3, %zmm2, %zmm3
-; AVX512-NEXT:    vplzcntq %zmm3, %zmm3
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
-; AVX512-NEXT:    vpsubq %zmm3, %zmm4, %zmm3
-; AVX512-NEXT:    vptestmq %zmm2, %zmm2, %k1
-; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [512,512,512,512,512,512,512,512]
-; AVX512-NEXT:    vpcompressq %zmm3, %zmm2 {%k1}
-; AVX512-NEXT:    vmovq %xmm2, %rax
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovdqu %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT:    vplzcntq %zmm1, %zmm1
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512-NEXT:    vpsubq %zmm1, %zmm2, %zmm1
+; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    movl $-2, %edx
+; AVX512-NEXT:    movl %eax, %ecx
+; AVX512-NEXT:    roll %cl, %edx
 ; AVX512-NEXT:    movl %eax, %ecx
-; AVX512-NEXT:    andl $32, %ecx
-; AVX512-NEXT:    movl %ecx, %edx
-; AVX512-NEXT:    notl %edx
-; AVX512-NEXT:    movl %eax, %esi
-; AVX512-NEXT:    shrl $3, %esi
-; AVX512-NEXT:    movl %esi, %r8d
-; AVX512-NEXT:    andl $56, %r8d
-; AVX512-NEXT:    movl -120(%rsp,%r8), %r9d
-; AVX512-NEXT:    addl %r9d, %r9d
-; AVX512-NEXT:    shlxq %rdx, %r9, %rdx
 ; AVX512-NEXT:    shrl $3, %ecx
-; AVX512-NEXT:    addq %rsp, %r8
-; AVX512-NEXT:    addq $-128, %r8
-; AVX512-NEXT:    orl (%rcx,%r8), %edx
-; AVX512-NEXT:    btrl %eax, %edx
-; AVX512-NEXT:    andl $60, %esi
-; AVX512-NEXT:    movl %edx, (%rdi,%rsi)
+; AVX512-NEXT:    andl $60, %ecx
+; AVX512-NEXT:    andl %edx, (%rdi,%rcx)
 ; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT:    popq %rcx
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %ld = load i512, ptr %word
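
Note: the fold itself is unchanged by this patch; only the legality guard is
relaxed. As a reference point, here is a minimal standalone sketch (plain C++,
not LLVM code; the helper names truncSrlLoad/narrowLoad and the test values
are invented for illustration) of why trunc(srl(load(p),amt)) reads the same
bytes as load(p+amt/8) on little-endian x86, given the KnownBits checks that
the shift amount is byte aligned and the truncation keeps no shifted-in zero
bits:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  // trunc(srl(load(p),amt)): load the wide value, shift right, truncate.
  static uint32_t truncSrlLoad(const void *p, unsigned Amt) {
    uint64_t Wide;
    std::memcpy(&Wide, p, sizeof(Wide)); // load(p)
    return (uint32_t)(Wide >> Amt);      // trunc(srl(...))
  }

  // load(p+amt/8): read the narrow type directly at the byte offset.
  static uint32_t narrowLoad(const void *p, unsigned Amt) {
    uint32_t Narrow;
    std::memcpy(&Narrow, (const char *)p + Amt / 8, sizeof(Narrow));
    return Narrow;
  }

  int main() {
    uint64_t Val = 0x1122334455667788ULL;
    // Amt must be a multiple of 8 and at most 64 - 32, mirroring the
    // countMinTrailingZeros() >= 3 and getMaxValue().ule(...) checks.
    for (unsigned Amt : {0u, 8u, 16u, 24u, 32u})
      assert(truncSrlLoad(&Val, Amt) == narrowLoad(&Val, Amt));
    return 0;
  }

The guard changed here is orthogonal to that byte-level equivalence: the
narrowed load presumably reuses the original load's chain, so the fold is now
skipped only when the shift amount is computed from a user of the load's
chain result (the hazard behind #165755), while a dependence on the load's
value result no longer blocks it.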