[X86] Fold generic ADD/SUB with constants to X86ISD::SUB/ADD #164316
Conversation
@llvm/pr-subscribers-backend-x86

Author: Brandon (brandonxin)

Changes

Fix #163125

This PR enhances `combineX86AddSub` so that it can handle `X86ISD::SUB(X, Constant)` with `add(X, -Constant)` and other similar cases:

- `X86ISD::ADD(LHS, C)` will fold `sub(-C, LHS)`
- `X86ISD::SUB(LHS, C)` will fold `add(LHS, -C)`
- `X86ISD::SUB(C, RHS)` will fold `add(RHS, -C)`

`CodeGen/X86/dag-update-nodetomatch.ll` is updated because the following IR is folded:

```llvm
for.body2:
  ; ......
  ; This generates `add t6, Constant:i64<1>`
  %indvars.iv.next = add nsw i64 %indvars.iv, 1;
  ; This generates `X86ISD::SUB t6, Constant:i64<-1>` and folds the previous `add`
  %cmp = icmp slt i64 %indvars.iv, -1;
  br i1 %cmp, label %for.body2, label %for.cond1.for.inc3_crit_edge.loopexit
```

```diff
- ; CHECK-NEXT: movq (%r15), %rax
- ; CHECK-NEXT: movq %rax, (%r12,%r13,8)
- ; CHECK-NEXT: leaq 1(%r13), %rdx
- ; CHECK-NEXT: cmpq $-1, %r13
- ; CHECK-NEXT: movq %rdx, %r13
+ ; CHECK-NEXT: movq (%r12), %rax
+ ; CHECK-NEXT: movq %rax, (%r13,%r9,8)
+ ; CHECK-NEXT: incq %r9
```

Full diff: https://github.com/llvm/llvm-project/pull/164316.diff

4 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5f8ee50cba3d..74a7d83aadfd9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57616,10 +57616,10 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
}
// Fold any similar generic ADD/SUB opcodes to reuse this node.
- auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+ auto MatchGeneric = [&](unsigned Opc, SDValue N0, SDValue N1, bool Negate) {
SDValue Ops[] = {N0, N1};
SDVTList VTs = DAG.getVTList(N->getValueType(0));
- if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+ if (SDNode *GenericAddSub = DAG.getNodeIfExists(Opc, VTs, Ops)) {
SDValue Op(N, 0);
if (Negate) {
// Bail if this is only used by a user of the x86 add/sub.
@@ -57631,8 +57631,25 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
DCI.CombineTo(GenericAddSub, Op);
}
};
- MatchGeneric(LHS, RHS, false);
- MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+ MatchGeneric(GenericOpc, LHS, RHS, false);
+ MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+ if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
+ SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+ if (X86ISD::SUB == N->getOpcode()) {
+ // With LHS - C, fold LHS + (-C)
+ MatchGeneric(ISD::ADD, LHS, NegC, false);
+ } else {
+ // With -(LHS + C), fold (-C) - LHS
+ MatchGeneric(ISD::SUB, NegC, LHS, true);
+ }
+ } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(LHS)) {
+ SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+ if (X86ISD::SUB == N->getOpcode()) {
+ // With -(C - RHS), fold RHS + (-C)
+ MatchGeneric(ISD::ADD, RHS, NegC, true);
+ }
+ }
// TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
// EFLAGS result doesn't change.
diff --git a/llvm/test/CodeGen/X86/combine-adc.ll b/llvm/test/CodeGen/X86/combine-adc.ll
index 22417363f1093..a2aaea31aa6ff 100644
--- a/llvm/test/CodeGen/X86/combine-adc.ll
+++ b/llvm/test/CodeGen/X86/combine-adc.ll
@@ -89,4 +89,52 @@ define i32 @adc_merge_constants(i32 %a0) nounwind {
ret i32 %sum
}
+define i32 @adc_merge_sub(i32 %a0) nounwind {
+; X86-LABEL: adc_merge_sub:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: addl $42, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: adc_merge_sub:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl %edi, %ebx
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: addl $42, %ebx
+; X64-NEXT: setb %dil
+; X64-NEXT: movl %ebx, %ebp
+; X64-NEXT: negl %ebp
+; X64-NEXT: callq use@PLT
+; X64-NEXT: xorl %ebx, %ebp
+; X64-NEXT: movl %ebp, %eax
+; X64-NEXT: addq $8, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+ %adc = tail call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %a0, i32 42)
+ %carry = extractvalue { i8, i32 } %adc, 0
+ call void @use(i8 %carry)
+ %sum = extractvalue { i8, i32 } %adc, 1
+ %sub = sub i32 -42, %a0
+ %result = xor i32 %sum, %sub
+ ret i32 %result
+}
+
declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll
index 89aee965a2c1f..62744d4f3050a 100644
--- a/llvm/test/CodeGen/X86/combine-sbb.ll
+++ b/llvm/test/CodeGen/X86/combine-sbb.ll
@@ -333,4 +333,85 @@ define i32 @PR40483_sub6(ptr, i32) nounwind {
ret i32 %10
}
+define i32 @sbb_merge_add1(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add1:
+; X86: # %bb.0:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl $42, {{[0-9]+}}(%esp)
+; X86-NEXT: setb %al
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: sbb_merge_add1:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl $42, %edi
+; X64-NEXT: setb %al
+; X64-NEXT: movl %eax, %edi
+; X64-NEXT: callq use@PLT
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: popq %rcx
+; X64-NEXT: retq
+ %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 %a0, i32 42)
+ %borrow = extractvalue { i8, i32 } %sbb, 0
+ call void @use(i8 %borrow)
+ %diff = extractvalue { i8, i32 } %sbb, 1
+ %add = add i32 %a0, -42
+ %result = xor i32 %diff, %add
+ ret i32 %result
+}
+
+define i32 @sbb_merge_add2(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl $42, %edi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: setb %al
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll use@PLT
+; X86-NEXT: addl $4, %esp
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: sbb_merge_add2:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl $42, %ebp
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subl %edi, %ebp
+; X64-NEXT: setb %al
+; X64-NEXT: movl %ebp, %ebx
+; X64-NEXT: negl %ebx
+; X64-NEXT: movl %eax, %edi
+; X64-NEXT: callq use@PLT
+; X64-NEXT: xorl %ebp, %ebx
+; X64-NEXT: movl %ebx, %eax
+; X64-NEXT: addq $8, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+ %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 42, i32 %a0)
+ %borrow = extractvalue { i8, i32 } %sbb, 0
+ call void @use(i8 %borrow)
+ %diff = extractvalue { i8, i32 } %sbb, 1
+ %add = add i32 %a0, -42
+ %result = xor i32 %diff, %add
+ ret i32 %result
+}
+
declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index b428ce457ff40..71ad598abe683 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -96,6 +96,17 @@ entry:
define void @_Z2x6v() local_unnamed_addr {
; CHECK-LABEL: _Z2x6v:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax
+; CHECK-NEXT: movl (%rax), %edx
+; CHECK-NEXT: andl $511, %edx # imm = 0x1FF
+; CHECK-NEXT: leaq 1(%rdx), %rax
+; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx
+; CHECK-NEXT: movl %eax, (%rcx)
+; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx
+; CHECK-NEXT: movl (%rcx), %ecx
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: je .LBB1_18
+; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
@@ -114,58 +125,47 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq x1@GOTPCREL(%rip), %rax
-; CHECK-NEXT: movl (%rax), %ebx
-; CHECK-NEXT: andl $511, %ebx # imm = 0x1FF
-; CHECK-NEXT: leaq 1(%rbx), %rax
-; CHECK-NEXT: movq x4@GOTPCREL(%rip), %rcx
-; CHECK-NEXT: movl %eax, (%rcx)
-; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rcx
-; CHECK-NEXT: movl (%rcx), %ecx
-; CHECK-NEXT: testl %ecx, %ecx
-; CHECK-NEXT: je .LBB1_18
-; CHECK-NEXT: # %bb.1: # %for.cond1thread-pre-split.lr.ph
-; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rdx
-; CHECK-NEXT: movq (%rdx), %rsi
-; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: notl %edx
-; CHECK-NEXT: leaq 8(,%rdx,8), %rdi
+; CHECK-NEXT: movq x5@GOTPCREL(%rip), %rsi
+; CHECK-NEXT: movq (%rsi), %rsi
+; CHECK-NEXT: movl %ecx, %edi
+; CHECK-NEXT: notl %edi
+; CHECK-NEXT: leaq 8(,%rdi,8), %rdi
; CHECK-NEXT: imulq %rax, %rdi
; CHECK-NEXT: addq %rsi, %rdi
; CHECK-NEXT: movq x2@GOTPCREL(%rip), %r8
-; CHECK-NEXT: movl (%r8), %edx
-; CHECK-NEXT: leal 8(,%rbx,8), %eax
+; CHECK-NEXT: movl (%r8), %r9d
+; CHECK-NEXT: leal 8(,%rdx,8), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: leaq 32(%rsi), %r11
-; CHECK-NEXT: leaq 8(,%rbx,8), %rbx
-; CHECK-NEXT: xorl %r14d, %r14d
-; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r15
-; CHECK-NEXT: movq %rsi, %r12
+; CHECK-NEXT: leaq 32(%rsi), %rbx
+; CHECK-NEXT: leaq 8(,%rdx,8), %r14
+; CHECK-NEXT: xorl %r15d, %r15d
+; CHECK-NEXT: movq x0@GOTPCREL(%rip), %r12
+; CHECK-NEXT: movq %rsi, %r13
; CHECK-NEXT: jmp .LBB1_2
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_15: # %for.cond1.for.inc3_crit_edge
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movl %edx, (%r8)
+; CHECK-NEXT: movl %r9d, (%r8)
; CHECK-NEXT: .LBB1_16: # %for.inc3
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq %rbx, %r12
-; CHECK-NEXT: incq %r14
-; CHECK-NEXT: addq %rbx, %r11
+; CHECK-NEXT: addq %r14, %r13
+; CHECK-NEXT: incq %r15
+; CHECK-NEXT: addq %r14, %rbx
; CHECK-NEXT: incl %ecx
; CHECK-NEXT: je .LBB1_17
; CHECK-NEXT: .LBB1_2: # %for.cond1thread-pre-split
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB1_12 Depth 2
; CHECK-NEXT: # Child Loop BB1_14 Depth 2
-; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: testl %r9d, %r9d
; CHECK-NEXT: jns .LBB1_16
; CHECK-NEXT: # %bb.3: # %for.body2.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: movslq %edx, %r13
-; CHECK-NEXT: testq %r13, %r13
+; CHECK-NEXT: movslq %r9d, %r9
+; CHECK-NEXT: testq %r9, %r9
; CHECK-NEXT: movq $-1, %rbp
-; CHECK-NEXT: cmovnsq %r13, %rbp
-; CHECK-NEXT: subq %r13, %rbp
+; CHECK-NEXT: cmovnsq %r9, %rbp
+; CHECK-NEXT: subq %r9, %rbp
; CHECK-NEXT: incq %rbp
; CHECK-NEXT: cmpq $4, %rbp
; CHECK-NEXT: jb .LBB1_14
@@ -177,20 +177,20 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: # %bb.5: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT: imulq %r14, %rax
-; CHECK-NEXT: leaq (%rsi,%rax), %r10
-; CHECK-NEXT: leaq (%r10,%r13,8), %r9
-; CHECK-NEXT: testq %r13, %r13
-; CHECK-NEXT: movq $-1, %r10
-; CHECK-NEXT: cmovnsq %r13, %r10
-; CHECK-NEXT: cmpq %r15, %r9
+; CHECK-NEXT: imulq %r15, %rax
+; CHECK-NEXT: leaq (%rsi,%rax), %r11
+; CHECK-NEXT: leaq (%r11,%r9,8), %r10
+; CHECK-NEXT: testq %r9, %r9
+; CHECK-NEXT: movq $-1, %r11
+; CHECK-NEXT: cmovnsq %r9, %r11
+; CHECK-NEXT: cmpq %r12, %r10
; CHECK-NEXT: jae .LBB1_7
; CHECK-NEXT: # %bb.6: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: leaq 8(%rsi), %r9
-; CHECK-NEXT: addq %r9, %rax
-; CHECK-NEXT: leaq (%rax,%r10,8), %rax
-; CHECK-NEXT: cmpq %r15, %rax
+; CHECK-NEXT: leaq 8(%rsi), %r10
+; CHECK-NEXT: addq %r10, %rax
+; CHECK-NEXT: leaq (%rax,%r11,8), %rax
+; CHECK-NEXT: cmpq %r12, %rax
; CHECK-NEXT: ja .LBB1_14
; CHECK-NEXT: .LBB1_7: # %vector.body.preheader
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
@@ -201,50 +201,47 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: movdqu %xmm0, (%r12,%r13,8)
-; CHECK-NEXT: movdqu %xmm0, 16(%r12,%r13,8)
-; CHECK-NEXT: movl $4, %r10d
+; CHECK-NEXT: movdqu %xmm0, (%r13,%r9,8)
+; CHECK-NEXT: movdqu %xmm0, 16(%r13,%r9,8)
+; CHECK-NEXT: movl $4, %r11d
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: jne .LBB1_11
; CHECK-NEXT: jmp .LBB1_13
; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: xorl %r10d, %r10d
+; CHECK-NEXT: xorl %r11d, %r11d
; CHECK-NEXT: shrq $2, %rax
; CHECK-NEXT: je .LBB1_13
; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: movq %r11, %rax
; CHECK-NEXT: subq %rdx, %rax
-; CHECK-NEXT: addq %r13, %r10
-; CHECK-NEXT: leaq (%r11,%r10,8), %r10
+; CHECK-NEXT: addq %r9, %r11
+; CHECK-NEXT: leaq (%rbx,%r11,8), %r11
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_12: # %vector.body
; CHECK-NEXT: # Parent Loop BB1_2 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movdqu %xmm0, -32(%r10)
-; CHECK-NEXT: movdqu %xmm0, -16(%r10)
-; CHECK-NEXT: movdqu %xmm0, (%r10)
-; CHECK-NEXT: movdqu %xmm0, 16(%r10)
-; CHECK-NEXT: addq $64, %r10
+; CHECK-NEXT: movdqu %xmm0, -32(%r11)
+; CHECK-NEXT: movdqu %xmm0, -16(%r11)
+; CHECK-NEXT: movdqu %xmm0, (%r11)
+; CHECK-NEXT: movdqu %xmm0, 16(%r11)
+; CHECK-NEXT: addq $64, %r11
; CHECK-NEXT: addq $8, %rax
; CHECK-NEXT: jne .LBB1_12
; CHECK-NEXT: .LBB1_13: # %middle.block
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq %rdx, %r13
+; CHECK-NEXT: addq %rdx, %r9
; CHECK-NEXT: cmpq %rdx, %rbp
-; CHECK-NEXT: movq %r13, %rdx
; CHECK-NEXT: je .LBB1_15
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_14: # %for.body2
; CHECK-NEXT: # Parent Loop BB1_2 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movq (%r15), %rax
-; CHECK-NEXT: movq %rax, (%r12,%r13,8)
-; CHECK-NEXT: leaq 1(%r13), %rdx
-; CHECK-NEXT: cmpq $-1, %r13
-; CHECK-NEXT: movq %rdx, %r13
+; CHECK-NEXT: movq (%r12), %rax
+; CHECK-NEXT: movq %rax, (%r13,%r9,8)
+; CHECK-NEXT: incq %r9
; CHECK-NEXT: jl .LBB1_14
; CHECK-NEXT: jmp .LBB1_15
; CHECK-NEXT: .LBB1_17: # %for.cond.for.end5_crit_edge
@@ -252,7 +249,6 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: movq %rdi, (%rax)
; CHECK-NEXT: movq x3@GOTPCREL(%rip), %rax
; CHECK-NEXT: movl $0, (%rax)
-; CHECK-NEXT: .LBB1_18: # %for.end5
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: popq %r12
@@ -265,6 +261,13 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .cfi_restore %rbx
+; CHECK-NEXT: .cfi_restore %r12
+; CHECK-NEXT: .cfi_restore %r13
+; CHECK-NEXT: .cfi_restore %r14
+; CHECK-NEXT: .cfi_restore %r15
+; CHECK-NEXT: .cfi_restore %rbp
+; CHECK-NEXT: .LBB1_18: # %for.end5
; CHECK-NEXT: retq
entry:
%0 = load i32, ptr @x1, align 4
    if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
      SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
      if (X86ISD::SUB == N->getOpcode()) {
        // With LHS - C, fold LHS + (-C)
Do we need to handle -(LHS - C)?
I think LHS - C will be canonicalized to LHS + (-C) by DAGCombiner::visitSUB first, so we don't need to handle it here.
https://github.com/llvm/llvm-project/blob/683e2bf059a6e5e0bb9dc2628218b53dc2d1b490/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp#L4160-L4163
If it is canonicalized, why do we still see LHS - C here?
What we see here is an X86ISD::SUB, which sets EFLAGS differently from X86ISD::ADD, so an X86ISD::SUB LHS, C cannot be canonicalized to X86ISD::ADD LHS, -C. That is why we explicitly check for the X86ISD::SUB pattern here.
In contrast, a generic ISD::SUB LHS, C is equivalent to ISD::ADD LHS, -C and is canonicalized by the DAG, so we don't need to look for ISD::SUB LHS, C to fold.
I am new to the backend part, so please correct me if I am wrong. Thank you!
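To make that concrete, here is a minimal sketch (the function name is hypothetical, and it assumes the subborrow intrinsic lowers to X86ISD::SUB, as in the tests above); it is essentially a stripped-down form of the new sbb_merge_add1 test:

```llvm
declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)

define i32 @keep_x86_sub(i32 %x) {
  ; The borrow flag is observed, so the backend must keep X86ISD::SUB %x, 42;
  ; rewriting it as an ADD of -42 would set EFLAGS differently.
  %sbb = call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 %x, i32 42)
  %borrow = extractvalue { i8, i32 } %sbb, 0
  %diff = extractvalue { i8, i32 } %sbb, 1
  ; The plain subtract reaches the DAG already canonicalized as `add %x, -42`,
  ; which the new code can fold into the X86ISD::SUB above.
  %also = add i32 %x, -42
  %b = zext i8 %borrow to i32
  %t = xor i32 %diff, %also      ; becomes 0 once both sides are the same node
  %r = add i32 %t, %b
  ret i32 %r
}
```

The xor of the two values is how the sbb_merge_add1 test observes that the fold happened.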
My initial question was whether we need to handle -(LHS - C) in the X86ISD::SUB case; I asked because I saw that you handled LHS - C and -(C - RHS).
Do you mean that with -X86ISD::SUB(LHS, C) we can fold the generic sub(C, LHS)? Yes, we can. It is handled by the MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode()) call above at line 57635.
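For reference, a sketch of that pre-existing commuted fold (hypothetical function name, same lowering assumptions as the sketch above): the generic `sub 42, %x` is just the operand-swapped form of the X86ISD::SUB, so it is replaced with a negation of that node without needing the new constant handling.

```llvm
declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)

define i32 @commuted_sub(i32 %x) {
  ; X86ISD::SUB %x, 42, kept because its borrow flag is used.
  %sbb = call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 %x, i32 42)
  %borrow = extractvalue { i8, i32 } %sbb, 0
  %diff = extractvalue { i8, i32 } %sbb, 1
  ; Generic ISD::SUB 42, %x equals -(%x - 42), so the existing
  ; MatchGeneric(GenericOpc, RHS, LHS, /*Negate=*/true) call replaces it
  ; with a negation of the node above.
  %swapped = sub i32 42, %x
  %b = zext i8 %borrow to i32
  %t = xor i32 %diff, %swapped
  %r = add i32 %t, %b
  ret i32 %r
}
```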
I get the point, thanks!
        // With LHS - C, fold LHS + (-C)
        MatchGeneric(ISD::ADD, LHS, NegC, false);
      } else {
        // With -(LHS + C), fold (-C) - LHS
I think the comment is misleading. We are folding LHS + C to -((-C) - LHS). We are creating a new negate instead of eliminating one.
Yes. Thanks for pointing that out. I have modified the comments to reflect that the generic op is being replaced, not simply eliminated.
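To illustrate the replacement being discussed (hypothetical function name; assumes the addcarry intrinsic lowers to X86ISD::ADD, as in the new adc_merge_sub test): the generic `sub -42, %x` equals `-(%x + 42)`, so it is rewritten as a newly created negation of the existing X86ISD::ADD node, which is why the adc_merge_sub output contains a `negl`.

```llvm
declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)

define i32 @neg_of_x86_add(i32 %x) {
  ; X86ISD::ADD %x, 42, kept because its carry flag is used.
  %adc = call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %x, i32 42)
  %carry = extractvalue { i8, i32 } %adc, 0
  %sum = extractvalue { i8, i32 } %adc, 1
  ; Generic ISD::SUB -42, %x equals -(%x + 42): MatchGeneric(ISD::SUB, NegC, LHS,
  ; /*Negate=*/true) replaces it with a negation of the X86ISD::ADD result,
  ; creating a new negate node rather than deleting anything.
  %negsum = sub i32 -42, %x
  %c = zext i8 %carry to i32
  %t = xor i32 %sum, %negsum
  %r = add i32 %t, %c
  ret i32 %r
}
```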
phoebewang left a comment:
LGTM.
RKSimon left a comment:
LGTM with one minor - thanks
RKSimon left a comment:
a couple more minors
Fix #163125
This PR enhances `combineX86AddSub` so that it can handle `X86ISD::SUB(X, Constant)` with `add(X, -Constant)` and other similar cases:

- `X86ISD::ADD(LHS, C)` will fold `sub(-C, LHS)`
- `X86ISD::SUB(LHS, C)` will fold `add(LHS, -C)`
- `X86ISD::SUB(C, RHS)` will fold `add(RHS, -C)`

`CodeGen/X86/dag-update-nodetomatch.ll` is updated because the IR shown in the description above is folded.