Skip to content
63 changes: 63 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51541,6 +51541,64 @@ static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
return SDValue();
}

/// Optimize (Constant XOR a) & b & ~c -> (Constant XOR a) & (b & ~c)
/// This allows the andn operation to be done in parallel with the xor
static SDValue combineConstantXorAndAndNot(SDNode *N, const SDLoc &DL,
                                           SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  using namespace llvm::SDPatternMatch;

  // ANDN is provided by BMI and only exists for 32/64-bit GPRs.
  EVT VT = N->getValueType(0);
  if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Check if N0 is AND(XOR(Constant, a), b). Require a single use: if the
  // inner AND has other users it survives the rewrite, so reassociating
  // would only add a node instead of re-shaping the dependency chain.
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();

  SDValue AndLHS = N0.getOperand(0);
  SDValue AndRHS = N0.getOperand(1);

  // One operand of the inner AND must be a XOR (the constant may sit on
  // either side of it).
  SDValue XorOp, OtherOp;
  if (AndLHS.getOpcode() == ISD::XOR) {
    XorOp = AndLHS;
    OtherOp = AndRHS;
  } else if (AndRHS.getOpcode() == ISD::XOR) {
    XorOp = AndRHS;
    OtherOp = AndLHS;
  } else {
    return SDValue();
  }

  // The XOR must have a constant operand.
  if (!isa<ConstantSDNode>(XorOp.getOperand(0)) &&
      !isa<ConstantSDNode>(XorOp.getOperand(1)))
    return SDValue();

  // N1 must be NOT(c) - i.e. XOR(c, all-ones) - which lowers to ANDN.
  if (!sd_match(N1, m_Not(m_Value())))
    return SDValue();

  // Transform: AND(AND(XOR(Constant, a), b), NOT(c))
  //        To: AND(XOR(Constant, a), AND(b, NOT(c)))
  // The new AND(b, NOT(c)) becomes an andn that no longer depends on the
  // xor result, so the two can execute in parallel.
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, OtherOp, N1);
  return DAG.getNode(ISD::AND, DL, VT, XorOp, NewAnd);
}

/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
SelectionDAG &DAG,
Expand Down Expand Up @@ -51833,6 +51891,11 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
return R;

// Optimize (Constant XOR a) & b & ~c -> (Constant XOR a) & (b & ~c)
// This allows the andn operation to be done in parallel with the xor
if (SDValue R = combineConstantXorAndAndNot(N, dl, DAG, Subtarget))
return R;

// fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
// iff c2 is all/no bits mask - i.e. a select-with-zero mask.
// TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
Expand Down
63 changes: 63 additions & 0 deletions llvm/test/CodeGen/X86/constant-xor-and-andnot.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s

; Test the optimization described in issue #161630:
; (Constant XOR a) & b & ~c should compile to allow andn to be done in parallel with xor

; i64 case: (a ^ 1234) & b & ~c. The andn (b & ~c) reads only %rsi/%rdx,
; so it is independent of the xor on %rdi and the two can issue in parallel.
define i64 @test_constant_xor_and_andnot(i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: test_constant_xor_and_andnot:
; CHECK: # %bb.0:
; CHECK-NEXT: xorq $1234, %rdi # imm = 0x4D2
; CHECK-NEXT: andnq %rsi, %rdx, %rax
; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: retq
%xor = xor i64 %a, 1234
%and1 = and i64 %xor, %b
%not_c = xor i64 %c, -1
%result = and i64 %and1, %not_c
ret i64 %result
}

; i32 variant of the same pattern: xor and andn are independent of each other.
define i32 @test_constant_xor_and_andnot_32(i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: test_constant_xor_and_andnot_32:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl $5678, %edi # imm = 0x162E
; CHECK-NEXT: andnl %esi, %edx, %eax
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: retq
%xor = xor i32 %a, 5678
%and1 = and i32 %xor, %b
%not_c = xor i32 %c, -1
%result = and i32 %and1, %not_c
ret i32 %result
}

; Test with different operand order
; Test with different operand order
; Here the xor is the RHS of the inner and (%b & %xor); the combine still
; fires and produces the same parallel xor/andn schedule as the first test.
define i64 @test_constant_xor_and_andnot_swapped(i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: test_constant_xor_and_andnot_swapped:
; CHECK: # %bb.0:
; CHECK-NEXT: xorq $1234, %rdi # imm = 0x4D2
; CHECK-NEXT: andnq %rsi, %rdx, %rax
; CHECK-NEXT: andq %rdi, %rax
; CHECK-NEXT: retq
%xor = xor i64 %a, 1234
%and1 = and i64 %b, %xor
%not_c = xor i64 %c, -1
%result = and i64 %and1, %not_c
ret i64 %result
}

; Test with different operand order for the final AND
; Test with different operand order for the final AND
; NOTE(review): with the not on the LHS of the final and (~c & (xor & b)),
; the checked output is xor -> and -> andn (a serial chain) rather than the
; parallel schedule above — confirm whether the commuted form should also be
; reassociated by the combine.
define i64 @test_constant_xor_and_andnot_final_swapped(i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: test_constant_xor_and_andnot_final_swapped:
; CHECK: # %bb.0:
; CHECK-NEXT: xorq $1234, %rdi # imm = 0x4D2
; CHECK-NEXT: andq %rsi, %rdi
; CHECK-NEXT: andnq %rdi, %rdx, %rax
; CHECK-NEXT: retq
%xor = xor i64 %a, 1234
%and1 = and i64 %xor, %b
%not_c = xor i64 %c, -1
%result = and i64 %not_c, %and1
ret i64 %result
}
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/pr108731.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ define i64 @test_i64(i64 %w, i64 %x, i64 %y, i64 %z) {
; BMI-LABEL: test_i64:
; BMI: # %bb.0: # %Entry
; BMI-NEXT: andq %rdx, %rsi
; BMI-NEXT: andnq %rdi, %rsi, %rax
; BMI-NEXT: andnq %rcx, %rdx, %rcx
; BMI-NEXT: andnq %rax, %rcx, %rax
; BMI-NEXT: andnq %rcx, %rdx, %rax
; BMI-NEXT: andnq %rdi, %rax, %rax
; BMI-NEXT: andnq %rax, %rsi, %rax
; BMI-NEXT: retq
Entry:
%and1 = and i64 %y, %x
Expand All @@ -46,9 +46,9 @@ define i32 @test_i32(i32 %w, i32 %x, i32 %y, i32 %z) {
; BMI-LABEL: test_i32:
; BMI: # %bb.0: # %Entry
; BMI-NEXT: andl %edx, %esi
; BMI-NEXT: andnl %edi, %esi, %eax
; BMI-NEXT: andnl %ecx, %edx, %ecx
; BMI-NEXT: andnl %eax, %ecx, %eax
; BMI-NEXT: andnl %ecx, %edx, %eax
; BMI-NEXT: andnl %edi, %eax, %eax
; BMI-NEXT: andnl %eax, %esi, %eax
; BMI-NEXT: retq
Entry:
%and1 = and i32 %y, %x
Expand Down
Loading