
Conversation

Contributor

@brandonxin brandonxin commented Oct 20, 2025

Fix #163125

This PR enhances combineX86AddSub so that an X86ISD::SUB(X, Constant) node can fold a matching generic add(X, -Constant), along with other similar cases:

  • X86ISD::ADD(LHS, C) will fold sub(-C, LHS)
  • X86ISD::SUB(LHS, C) will fold add(LHS, -C)
  • X86ISD::SUB(C, RHS) will fold add(RHS, -C)
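
These folds rely on standard two's-complement identities. The following stand-alone check (illustrative only, not part of the patch) spells them out using unsigned 32-bit wraparound as a stand-in for the DAG's two's-complement arithmetic:

```cpp
// Illustrative check (not part of the patch) of the identities behind the folds.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t lhs = 0x12345678u, rhs = 0x9ABCDEF0u, c = 0xDEADBEEFu;

  // X86ISD::ADD(LHS, C) can replace sub(-C, LHS): (-C) - LHS == -(LHS + C),
  // i.e. the generic sub becomes a negate of the x86 add result.
  assert(0u - c - lhs == 0u - (lhs + c));

  // X86ISD::SUB(LHS, C) can replace add(LHS, -C): LHS + (-C) == LHS - C.
  assert(lhs + (0u - c) == lhs - c);

  // X86ISD::SUB(C, RHS) can replace add(RHS, -C): RHS + (-C) == -(C - RHS),
  // again via a negate of the x86 sub result.
  assert(rhs + (0u - c) == 0u - (c - rhs));
  return 0;
}
```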

CodeGen/X86/dag-update-nodetomatch.ll is updated because the following IR is now folded:

for.body2:
  ; ......

  ; This generates `add t6, Constant:i64<1>`
  %indvars.iv.next = add nsw i64 %indvars.iv, 1;

  ; This generates `X86ISD::SUB t6, Constant:i64<-1>` and folds the previous `add`
  %cmp = icmp slt i64 %indvars.iv, -1; 

  br i1 %cmp, label %for.body2, label %for.cond1.for.inc3_crit_edge.loopexit
- ; CHECK-NEXT:    movq (%r15), %rax
- ; CHECK-NEXT:    movq %rax, (%r12,%r13,8)
- ; CHECK-NEXT:    leaq 1(%r13), %rdx
- ; CHECK-NEXT:    cmpq $-1, %r13
- ; CHECK-NEXT:    movq %rdx, %r13
+ ; CHECK-NEXT:    movq (%r12), %rax
+ ; CHECK-NEXT:    movq %rax, (%r13,%r9,8)
+ ; CHECK-NEXT:    incq %r9

Member

llvmbot commented Oct 20, 2025

@llvm/pr-subscribers-backend-x86

Author: Brandon (brandonxin)

Changes

Fix #163125

This PR enhances combineX86AddSub so that an X86ISD::SUB(X, Constant) node can fold a matching generic add(X, -Constant), along with other similar cases:

  • X86ISD::ADD(LHS, C) will fold sub(-C, LHS)
  • X86ISD::SUB(LHS, C) will fold add(LHS, -C)
  • X86ISD::SUB(C, RHS) will fold add(RHS, -C)

CodeGen/X86/dag-update-nodetomatch.ll is updated because the following IR is now folded:

for.body2:
  ; ......

  ; This generates `add t6, Constant:i64<1>`
  %indvars.iv.next = add nsw i64 %indvars.iv, 1;

  ; This generates `X86ISD::SUB t6, Constant:i64<-1>` and folds the previous `add`
  %cmp = icmp slt i64 %indvars.iv, -1; 

  br i1 %cmp, label %for.body2, label %for.cond1.for.inc3_crit_edge.loopexit
- ; CHECK-NEXT:    movq (%r15), %rax
- ; CHECK-NEXT:    movq %rax, (%r12,%r13,8)
- ; CHECK-NEXT:    leaq 1(%r13), %rdx
- ; CHECK-NEXT:    cmpq $-1, %r13
- ; CHECK-NEXT:    movq %rdx, %r13
+ ; CHECK-NEXT:    movq (%r12), %rax
+ ; CHECK-NEXT:    movq %rax, (%r13,%r9,8)
+ ; CHECK-NEXT:    incq %r9

Full diff: https://github.com/llvm/llvm-project/pull/164316.diff

4 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+21-4)
  • (modified) llvm/test/CodeGen/X86/combine-adc.ll (+48)
  • (modified) llvm/test/CodeGen/X86/combine-sbb.ll (+81)
  • (modified) llvm/test/CodeGen/X86/dag-update-nodetomatch.ll (+66-63)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b5f8ee50cba3d..74a7d83aadfd9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57616,10 +57616,10 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
   }
 
   // Fold any similar generic ADD/SUB opcodes to reuse this node.
-  auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
+  auto MatchGeneric = [&](unsigned Opc, SDValue N0, SDValue N1, bool Negate) {
     SDValue Ops[] = {N0, N1};
     SDVTList VTs = DAG.getVTList(N->getValueType(0));
-    if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
+    if (SDNode *GenericAddSub = DAG.getNodeIfExists(Opc, VTs, Ops)) {
       SDValue Op(N, 0);
       if (Negate) {
         // Bail if this is only used by a user of the x86 add/sub.
@@ -57631,8 +57631,25 @@ static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
       DCI.CombineTo(GenericAddSub, Op);
     }
   };
-  MatchGeneric(LHS, RHS, false);
-  MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
+  MatchGeneric(GenericOpc, LHS, RHS, false);
+  MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode());
+
+  if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
+    SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+    if (X86ISD::SUB == N->getOpcode()) {
+      // With LHS - C, fold LHS + (-C)
+      MatchGeneric(ISD::ADD, LHS, NegC, false);
+    } else {
+      // With -(LHS + C), fold (-C) - LHS
+      MatchGeneric(ISD::SUB, NegC, LHS, true);
+    }
+  } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(LHS)) {
+    SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
+    if (X86ISD::SUB == N->getOpcode()) {
+      // With -(C - RHS), fold RHS + (-C)
+      MatchGeneric(ISD::ADD, RHS, NegC, true);
+    }
+  }
 
   // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
   // EFLAGS result doesn't change.
diff --git a/llvm/test/CodeGen/X86/combine-adc.ll b/llvm/test/CodeGen/X86/combine-adc.ll
index 22417363f1093..a2aaea31aa6ff 100644
--- a/llvm/test/CodeGen/X86/combine-adc.ll
+++ b/llvm/test/CodeGen/X86/combine-adc.ll
@@ -89,4 +89,52 @@ define i32 @adc_merge_constants(i32 %a0) nounwind {
   ret i32 %sum
 }
 
+define i32 @adc_merge_sub(i32 %a0) nounwind {
+; X86-LABEL: adc_merge_sub:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl $42, %edi
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll use@PLT
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl
+;
+; X64-LABEL: adc_merge_sub:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl %edi, %ebx
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    addl $42, %ebx
+; X64-NEXT:    setb %dil
+; X64-NEXT:    movl %ebx, %ebp
+; X64-NEXT:    negl %ebp
+; X64-NEXT:    callq use@PLT
+; X64-NEXT:    xorl %ebx, %ebp
+; X64-NEXT:    movl %ebp, %eax
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+  %adc = tail call { i8, i32 } @llvm.x86.addcarry.32(i8 0, i32 %a0, i32 42)
+  %carry = extractvalue { i8, i32 } %adc, 0
+  call void @use(i8 %carry)
+  %sum = extractvalue { i8, i32 } %adc, 1
+  %sub = sub i32 -42, %a0
+  %result = xor i32 %sum, %sub
+  ret i32 %result
+}
+
 declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll
index 89aee965a2c1f..62744d4f3050a 100644
--- a/llvm/test/CodeGen/X86/combine-sbb.ll
+++ b/llvm/test/CodeGen/X86/combine-sbb.ll
@@ -333,4 +333,85 @@ define i32 @PR40483_sub6(ptr, i32) nounwind {
   ret i32 %10
 }
 
+define i32 @sbb_merge_add1(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add1:
+; X86:       # %bb.0:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl $42, {{[0-9]+}}(%esp)
+; X86-NEXT:    setb %al
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll use@PLT
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: sbb_merge_add1:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl $42, %edi
+; X64-NEXT:    setb %al
+; X64-NEXT:    movl %eax, %edi
+; X64-NEXT:    callq use@PLT
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 %a0, i32 42)
+  %borrow = extractvalue { i8, i32 } %sbb, 0
+  call void @use(i8 %borrow)
+  %diff = extractvalue { i8, i32 } %sbb, 1
+  %add = add i32 %a0, -42
+  %result = xor i32 %diff, %add
+  ret i32 %result
+}
+
+define i32 @sbb_merge_add2(i32 %a0) nounwind {
+; X86-LABEL: sbb_merge_add2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl $42, %edi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll use@PLT
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl
+;
+; X64-LABEL: sbb_merge_add2:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $42, %ebp
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    subl %edi, %ebp
+; X64-NEXT:    setb %al
+; X64-NEXT:    movl %ebp, %ebx
+; X64-NEXT:    negl %ebx
+; X64-NEXT:    movl %eax, %edi
+; X64-NEXT:    callq use@PLT
+; X64-NEXT:    xorl %ebp, %ebx
+; X64-NEXT:    movl %ebx, %eax
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+  %sbb = tail call { i8, i32 } @llvm.x86.subborrow.32(i8 0, i32 42, i32 %a0)
+  %borrow = extractvalue { i8, i32 } %sbb, 0
+  call void @use(i8 %borrow)
+  %diff = extractvalue { i8, i32 } %sbb, 1
+  %add = add i32 %a0, -42
+  %result = xor i32 %diff, %add
+  ret i32 %result
+}
+
 declare { i8, i32 } @llvm.x86.subborrow.32(i8, i32, i32)
+declare void @use(i8)
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index b428ce457ff40..71ad598abe683 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -96,6 +96,17 @@ entry:
 define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-LABEL: _Z2x6v:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq x1@GOTPCREL(%rip), %rax
+; CHECK-NEXT:    movl (%rax), %edx
+; CHECK-NEXT:    andl $511, %edx # imm = 0x1FF
+; CHECK-NEXT:    leaq 1(%rdx), %rax
+; CHECK-NEXT:    movq x4@GOTPCREL(%rip), %rcx
+; CHECK-NEXT:    movl %eax, (%rcx)
+; CHECK-NEXT:    movq x3@GOTPCREL(%rip), %rcx
+; CHECK-NEXT:    movl (%rcx), %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    je .LBB1_18
+; CHECK-NEXT:  # %bb.1: # %for.cond1thread-pre-split.lr.ph
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    pushq %r15
@@ -114,58 +125,47 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    .cfi_offset %r14, -32
 ; CHECK-NEXT:    .cfi_offset %r15, -24
 ; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movq x1@GOTPCREL(%rip), %rax
-; CHECK-NEXT:    movl (%rax), %ebx
-; CHECK-NEXT:    andl $511, %ebx # imm = 0x1FF
-; CHECK-NEXT:    leaq 1(%rbx), %rax
-; CHECK-NEXT:    movq x4@GOTPCREL(%rip), %rcx
-; CHECK-NEXT:    movl %eax, (%rcx)
-; CHECK-NEXT:    movq x3@GOTPCREL(%rip), %rcx
-; CHECK-NEXT:    movl (%rcx), %ecx
-; CHECK-NEXT:    testl %ecx, %ecx
-; CHECK-NEXT:    je .LBB1_18
-; CHECK-NEXT:  # %bb.1: # %for.cond1thread-pre-split.lr.ph
-; CHECK-NEXT:    movq x5@GOTPCREL(%rip), %rdx
-; CHECK-NEXT:    movq (%rdx), %rsi
-; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    notl %edx
-; CHECK-NEXT:    leaq 8(,%rdx,8), %rdi
+; CHECK-NEXT:    movq x5@GOTPCREL(%rip), %rsi
+; CHECK-NEXT:    movq (%rsi), %rsi
+; CHECK-NEXT:    movl %ecx, %edi
+; CHECK-NEXT:    notl %edi
+; CHECK-NEXT:    leaq 8(,%rdi,8), %rdi
 ; CHECK-NEXT:    imulq %rax, %rdi
 ; CHECK-NEXT:    addq %rsi, %rdi
 ; CHECK-NEXT:    movq x2@GOTPCREL(%rip), %r8
-; CHECK-NEXT:    movl (%r8), %edx
-; CHECK-NEXT:    leal 8(,%rbx,8), %eax
+; CHECK-NEXT:    movl (%r8), %r9d
+; CHECK-NEXT:    leal 8(,%rdx,8), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    leaq 32(%rsi), %r11
-; CHECK-NEXT:    leaq 8(,%rbx,8), %rbx
-; CHECK-NEXT:    xorl %r14d, %r14d
-; CHECK-NEXT:    movq x0@GOTPCREL(%rip), %r15
-; CHECK-NEXT:    movq %rsi, %r12
+; CHECK-NEXT:    leaq 32(%rsi), %rbx
+; CHECK-NEXT:    leaq 8(,%rdx,8), %r14
+; CHECK-NEXT:    xorl %r15d, %r15d
+; CHECK-NEXT:    movq x0@GOTPCREL(%rip), %r12
+; CHECK-NEXT:    movq %rsi, %r13
 ; CHECK-NEXT:    jmp .LBB1_2
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_15: # %for.cond1.for.inc3_crit_edge
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    movl %edx, (%r8)
+; CHECK-NEXT:    movl %r9d, (%r8)
 ; CHECK-NEXT:  .LBB1_16: # %for.inc3
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    addq %rbx, %r12
-; CHECK-NEXT:    incq %r14
-; CHECK-NEXT:    addq %rbx, %r11
+; CHECK-NEXT:    addq %r14, %r13
+; CHECK-NEXT:    incq %r15
+; CHECK-NEXT:    addq %r14, %rbx
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    je .LBB1_17
 ; CHECK-NEXT:  .LBB1_2: # %for.cond1thread-pre-split
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB1_12 Depth 2
 ; CHECK-NEXT:    # Child Loop BB1_14 Depth 2
-; CHECK-NEXT:    testl %edx, %edx
+; CHECK-NEXT:    testl %r9d, %r9d
 ; CHECK-NEXT:    jns .LBB1_16
 ; CHECK-NEXT:  # %bb.3: # %for.body2.preheader
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    movslq %edx, %r13
-; CHECK-NEXT:    testq %r13, %r13
+; CHECK-NEXT:    movslq %r9d, %r9
+; CHECK-NEXT:    testq %r9, %r9
 ; CHECK-NEXT:    movq $-1, %rbp
-; CHECK-NEXT:    cmovnsq %r13, %rbp
-; CHECK-NEXT:    subq %r13, %rbp
+; CHECK-NEXT:    cmovnsq %r9, %rbp
+; CHECK-NEXT:    subq %r9, %rbp
 ; CHECK-NEXT:    incq %rbp
 ; CHECK-NEXT:    cmpq $4, %rbp
 ; CHECK-NEXT:    jb .LBB1_14
@@ -177,20 +177,20 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:  # %bb.5: # %vector.memcheck
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-NEXT:    imulq %r14, %rax
-; CHECK-NEXT:    leaq (%rsi,%rax), %r10
-; CHECK-NEXT:    leaq (%r10,%r13,8), %r9
-; CHECK-NEXT:    testq %r13, %r13
-; CHECK-NEXT:    movq $-1, %r10
-; CHECK-NEXT:    cmovnsq %r13, %r10
-; CHECK-NEXT:    cmpq %r15, %r9
+; CHECK-NEXT:    imulq %r15, %rax
+; CHECK-NEXT:    leaq (%rsi,%rax), %r11
+; CHECK-NEXT:    leaq (%r11,%r9,8), %r10
+; CHECK-NEXT:    testq %r9, %r9
+; CHECK-NEXT:    movq $-1, %r11
+; CHECK-NEXT:    cmovnsq %r9, %r11
+; CHECK-NEXT:    cmpq %r12, %r10
 ; CHECK-NEXT:    jae .LBB1_7
 ; CHECK-NEXT:  # %bb.6: # %vector.memcheck
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    leaq 8(%rsi), %r9
-; CHECK-NEXT:    addq %r9, %rax
-; CHECK-NEXT:    leaq (%rax,%r10,8), %rax
-; CHECK-NEXT:    cmpq %r15, %rax
+; CHECK-NEXT:    leaq 8(%rsi), %r10
+; CHECK-NEXT:    addq %r10, %rax
+; CHECK-NEXT:    leaq (%rax,%r11,8), %rax
+; CHECK-NEXT:    cmpq %r12, %rax
 ; CHECK-NEXT:    ja .LBB1_14
 ; CHECK-NEXT:  .LBB1_7: # %vector.body.preheader
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
@@ -201,50 +201,47 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT:    movdqu %xmm0, (%r12,%r13,8)
-; CHECK-NEXT:    movdqu %xmm0, 16(%r12,%r13,8)
-; CHECK-NEXT:    movl $4, %r10d
+; CHECK-NEXT:    movdqu %xmm0, (%r13,%r9,8)
+; CHECK-NEXT:    movdqu %xmm0, 16(%r13,%r9,8)
+; CHECK-NEXT:    movl $4, %r11d
 ; CHECK-NEXT:    shrq $2, %rax
 ; CHECK-NEXT:    jne .LBB1_11
 ; CHECK-NEXT:    jmp .LBB1_13
 ; CHECK-NEXT:  .LBB1_8: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    xorl %r10d, %r10d
+; CHECK-NEXT:    xorl %r11d, %r11d
 ; CHECK-NEXT:    shrq $2, %rax
 ; CHECK-NEXT:    je .LBB1_13
 ; CHECK-NEXT:  .LBB1_11: # %vector.body.preheader.new
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT:    movq %r10, %rax
+; CHECK-NEXT:    movq %r11, %rax
 ; CHECK-NEXT:    subq %rdx, %rax
-; CHECK-NEXT:    addq %r13, %r10
-; CHECK-NEXT:    leaq (%r11,%r10,8), %r10
+; CHECK-NEXT:    addq %r9, %r11
+; CHECK-NEXT:    leaq (%rbx,%r11,8), %r11
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_12: # %vector.body
 ; CHECK-NEXT:    # Parent Loop BB1_2 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    movdqu %xmm0, -32(%r10)
-; CHECK-NEXT:    movdqu %xmm0, -16(%r10)
-; CHECK-NEXT:    movdqu %xmm0, (%r10)
-; CHECK-NEXT:    movdqu %xmm0, 16(%r10)
-; CHECK-NEXT:    addq $64, %r10
+; CHECK-NEXT:    movdqu %xmm0, -32(%r11)
+; CHECK-NEXT:    movdqu %xmm0, -16(%r11)
+; CHECK-NEXT:    movdqu %xmm0, (%r11)
+; CHECK-NEXT:    movdqu %xmm0, 16(%r11)
+; CHECK-NEXT:    addq $64, %r11
 ; CHECK-NEXT:    addq $8, %rax
 ; CHECK-NEXT:    jne .LBB1_12
 ; CHECK-NEXT:  .LBB1_13: # %middle.block
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    addq %rdx, %r13
+; CHECK-NEXT:    addq %rdx, %r9
 ; CHECK-NEXT:    cmpq %rdx, %rbp
-; CHECK-NEXT:    movq %r13, %rdx
 ; CHECK-NEXT:    je .LBB1_15
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_14: # %for.body2
 ; CHECK-NEXT:    # Parent Loop BB1_2 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    movq (%r15), %rax
-; CHECK-NEXT:    movq %rax, (%r12,%r13,8)
-; CHECK-NEXT:    leaq 1(%r13), %rdx
-; CHECK-NEXT:    cmpq $-1, %r13
-; CHECK-NEXT:    movq %rdx, %r13
+; CHECK-NEXT:    movq (%r12), %rax
+; CHECK-NEXT:    movq %rax, (%r13,%r9,8)
+; CHECK-NEXT:    incq %r9
 ; CHECK-NEXT:    jl .LBB1_14
 ; CHECK-NEXT:    jmp .LBB1_15
 ; CHECK-NEXT:  .LBB1_17: # %for.cond.for.end5_crit_edge
@@ -252,7 +249,6 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    movq %rdi, (%rax)
 ; CHECK-NEXT:    movq x3@GOTPCREL(%rip), %rax
 ; CHECK-NEXT:    movl $0, (%rax)
-; CHECK-NEXT:  .LBB1_18: # %for.end5
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
@@ -265,6 +261,13 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    .cfi_restore %rbx
+; CHECK-NEXT:    .cfi_restore %r12
+; CHECK-NEXT:    .cfi_restore %r13
+; CHECK-NEXT:    .cfi_restore %r14
+; CHECK-NEXT:    .cfi_restore %r15
+; CHECK-NEXT:    .cfi_restore %rbp
+; CHECK-NEXT:  .LBB1_18: # %for.end5
 ; CHECK-NEXT:    retq
 entry:
   %0 = load i32, ptr @x1, align 4

@RKSimon RKSimon requested review from RKSimon and phoebewang October 20, 2025 21:11
if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
SDValue NegC = DAG.getConstant(-Const->getAPIntValue(), DL, VT);
if (X86ISD::SUB == N->getOpcode()) {
// With LHS - C, fold LHS + (-C)
Contributor

Do we need to handle -(LHS - C)?

Contributor Author

I think LHS - C will be canonicalized to LHS + (-C) by DAGCombiner::visitSUB first, so we don't need to handle it here.
https://github.com/llvm/llvm-project/blob/683e2bf059a6e5e0bb9dc2628218b53dc2d1b490/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp#L4160-L4163

Contributor

If it is canonicalized, why do we still see LHS - C here?

Contributor Author

What we see here is an X86ISD::SUB, which sets EFLAGS differently from X86ISD::ADD, so an X86ISD::SUB LHS, C cannot be canonicalized to X86ISD::ADD LHS, -C. Therefore we explicitly check for the X86ISD::SUB pattern here.

In contrast, a generic ISD::SUB LHS, C is equivalent to ISD::ADD LHS, -C and is canonicalized by the DAG, so we don't need to look for the ISD::SUB LHS, C to fold.
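
As a concrete illustration of the EFLAGS point (illustrative sketch only, not from the patch or the discussion): at x == 0 the carry flag produced by `sub x, 1` differs from the one produced by `add x, -1`, so the two x86 nodes are not interchangeable when EFLAGS are consumed.

```cpp
// Illustrative only: models how the x86 carry flag (CF) would be set for
// "add x, -1" versus "sub x, 1" at x == 0, showing the flag results differ.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t x = 0;
  // ADD sets CF when the unsigned sum wraps past 2^32 - 1.
  bool cf_add = (uint64_t)x + 0xFFFFFFFFull > 0xFFFFFFFFull;  // false: 0 + 0xFFFFFFFF does not wrap
  // SUB sets CF when an unsigned borrow occurs.
  bool cf_sub = x < 1u;                                        // true: 0 - 1 borrows
  printf("CF(add x, -1) = %d, CF(sub x, 1) = %d\n", (int)cf_add, (int)cf_sub);
  return 0;
}
```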

Contributor Author

I am new to the backend part, so please correct me if I am wrong. Thank you!

Contributor

My initial question was whether we need to handle -(LHS - C) in the X86ISD::SUB case; I asked because I see you handled LHS - C and -(C - RHS).

Contributor Author

Do you mean with -X86ISD::SUB(LHS, C) we can fold the generic sub(C, LHS)? Yes we can. It is handled by the MatchGeneric(GenericOpc, RHS, LHS, X86ISD::SUB == N->getOpcode()); above at line 57635.

Contributor

I get the point, thanks!


// With LHS - C, fold LHS + (-C)
MatchGeneric(ISD::ADD, LHS, NegC, false);
} else {
// With -(LHS + C), fold (-C) - LHS
Contributor

I think the comment is misleading. We are folding LHS + C to -((-C) - LHS). We are creating a new negate instead of eliminating one.

Contributor Author

Yes. Thanks for pointing that out. I have modified the comments to reflect that the generic op is being replaced, not simply eliminated.

Contributor

@phoebewang phoebewang left a comment

LGTM.

Collaborator

@RKSimon RKSimon left a comment

LGTM with one minor - thanks

Collaborator

@RKSimon RKSimon left a comment

a couple more minors

@RKSimon RKSimon enabled auto-merge (squash) October 24, 2025 07:45
@RKSimon RKSimon merged commit ea2de9a into llvm:main Oct 24, 2025
9 of 10 checks passed
dvbuka pushed a commit to dvbuka/llvm-project that referenced this pull request Oct 27, 2025
Lukacma pushed a commit to Lukacma/llvm-project that referenced this pull request Oct 29, 2025
aokblast pushed a commit to aokblast/llvm-project that referenced this pull request Oct 30, 2025


Development

Successfully merging this pull request may close these issues.

lea esi, [rdx - 32]+cmp edx, 32+mov edx, esi => sub edx, 32
