Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49520,6 +49520,15 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;

// VCOMXSS simplifies conditional code sequence into single setcc node.
// Earlier until COMI, it required upto 2 SETCC's to test CC.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

clang-format.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand the logic here. Why do we require 2 CC for comx? I don't see a test case using 2 CC. Can we just return SDValue() to break combine?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On First line, I mark intent to do this legalization. COMX tests more flags and we need Single SETCC Node to infer.
Earlier Attempts used chain of SETCC like in X86ISelLowering.cpp snippet

  case COMI: { // Comparison intrinsics
....
....
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: {
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);   // FIRST SETCC 
        if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 1
          break;
        // (ZF = 1 and PF = 0)
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);  // SECOND SETCC 
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }

May be instead of 2 CC, I need to write 2 Flags. Is it?

If we return SDValue(), We select vucomiss so to get desired selection vucomxss we need to change this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Can we generate UCOMX there instead of generating 2 CC and then combine?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mahesh-attarde Did you investigate this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optimized type-legalized selection DAG: %bb.0 'oeq:'
SelectionDAG has 11 nodes:
  t0: ch,glue = EntryToken
      t2: f32,ch = CopyFromReg t0, Register:f32 %0
      t4: f32,ch = CopyFromReg t0, Register:f32 %1
    t14: i8 = setcc t2, t4, setoeq:ch
  t10: ch,glue = CopyToReg t0, Register:i8 $al, t14
  t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1

Legalized selection DAG: %bb.0 'oeq:'
SelectionDAG has 15 nodes:
  t0: ch,glue = EntryToken
      t24: i8 = X86ISD::SETCC TargetConstant:i8<4>, t20
      t22: i8 = X86ISD::SETCC TargetConstant:i8<11>, t20
    t19: i8 = and t24, t22
  t10: ch,glue = CopyToReg t0, Register:i8 $al, t19
    t2: f32,ch = CopyFromReg t0, Register:f32 %0
    t4: f32,ch = CopyFromReg t0, Register:f32 %1
  t20: i32 = X86ISD::FCMP t2, t4
  t11: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>, Register:i8 $al, t10:1

If i understand correctly you are asking to remove t19, t22 and t24, is that correct?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#113567 added here. Can you review?

if (Subtarget.hasAVX10_2()) {
return getSETCC(
((cc0 == X86::COND_E) ? X86::COND_E : X86::COND_NE),
DAG.getNode(X86ISD::UCOMX, DL, MVT::i32, CMP00, CMP01), DL,
DAG);
}
if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
Expand Down
23 changes: 23 additions & 0 deletions llvm/lib/Target/X86/X86InstrAVX10.td
Original file line number Diff line number Diff line change
Expand Up @@ -1541,6 +1541,24 @@ defm VFNMSUB132NEPBF16 : avx10_fma3p_132_bf16<0x9E, "vfnmsub132nepbf16", X86any_
//-------------------------------------------------
// AVX10 COMEF instructions
//-------------------------------------------------
multiclass avx10_com_ef<bits<8> Opc, RegisterClass RC, ValueType VT,
SDPatternOperator OpNode, string OpcodeStr,
X86MemOperand x86memop, PatFrag ld_frag,
Domain d, X86FoldableSchedWrite sched = WriteFComX>{
let ExeDomain = d, mayRaiseFPException = 1, isCodeGenOnly = 1 in {
def rr : AVX512<Opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (VT RC:$src1), RC:$src2))]>,
EVEX, EVEX_V128, Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in {
def rm : AVX512<Opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (VT RC:$src1), (ld_frag addr:$src2)))]>,
EVEX, EVEX_V128, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
}

multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode,
string OpcodeStr,
Domain d,
Expand All @@ -1564,6 +1582,11 @@ multiclass avx10_com_ef_int<bits<8> Opc, X86VectorVTInfo _, SDNode OpNode,
}

let Defs = [EFLAGS], Uses = [MXCSR], Predicates = [HasAVX10_2] in {

defm VUCOMXSSZ : avx10_com_ef<0x2e, FR32X, f32, X86ucomi512,
"vucomxss", f32mem, loadf32, SSEPackedSingle>,
TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
Comment on lines +1592 to +1594
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SD and SH?


defm VCOMXSDZ : avx10_com_ef_int<0x2f, v2f64x_info, X86comi512,
"vcomxsd", SSEPackedDouble>,
TB, XS, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>;
Expand Down
121 changes: 121 additions & 0 deletions llvm/test/CodeGen/X86/avx10_2-cmp.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=AVX10_2_X64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx10.2-512 | FileCheck %s --check-prefix=AVX10_2_X86
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just use X86/X64 for check prefixes


define i1 @oeq(float %x, float %y) {
; AVX10_2_X64-LABEL: oeq:
; AVX10_2_X64: # %bb.0:
; AVX10_2_X64-NEXT: vucomxss %xmm1, %xmm0
; AVX10_2_X64-NEXT: sete %al
; AVX10_2_X64-NEXT: retq
;
; AVX10_2_X86-LABEL: oeq:
; AVX10_2_X86: # %bb.0:
; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX10_2_X86-NEXT: vucomxss {{[0-9]+}}(%esp), %xmm0
; AVX10_2_X86-NEXT: sete %al
; AVX10_2_X86-NEXT: retl
%1 = fcmp oeq float %x, %y
ret i1 %1
}

define i1 @une(float %x, float %y) {
; AVX10_2_X64-LABEL: une:
; AVX10_2_X64: # %bb.0:
; AVX10_2_X64-NEXT: vucomxss %xmm1, %xmm0
; AVX10_2_X64-NEXT: setne %al
; AVX10_2_X64-NEXT: retq
;
; AVX10_2_X86-LABEL: une:
; AVX10_2_X86: # %bb.0:
; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX10_2_X86-NEXT: vucomxss {{[0-9]+}}(%esp), %xmm0
; AVX10_2_X86-NEXT: setne %al
; AVX10_2_X86-NEXT: retl
%1 = fcmp une float %x, %y
ret i1 %1
}

define i1 @ogt(float %x, float %y) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ogt generates expected code without this patch. No need to test it.

; AVX10_2_X64-LABEL: ogt:
; AVX10_2_X64: # %bb.0:
; AVX10_2_X64-NEXT: vucomiss %xmm1, %xmm0
; AVX10_2_X64-NEXT: seta %al
; AVX10_2_X64-NEXT: retq
;
; AVX10_2_X86-LABEL: ogt:
; AVX10_2_X86: # %bb.0:
; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX10_2_X86-NEXT: vucomiss {{[0-9]+}}(%esp), %xmm0
; AVX10_2_X86-NEXT: seta %al
; AVX10_2_X86-NEXT: retl
%1 = fcmp ogt float %x, %y
ret i1 %1
}

define i1 @oeq_mem(ptr %xp, ptr %yp) {
; AVX10_2_X64-LABEL: oeq_mem:
; AVX10_2_X64: # %bb.0:
; AVX10_2_X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX10_2_X64-NEXT: vucomxss (%rsi), %xmm0
; AVX10_2_X64-NEXT: sete %al
; AVX10_2_X64-NEXT: retq
;
; AVX10_2_X86-LABEL: oeq_mem:
; AVX10_2_X86: # %bb.0:
; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX10_2_X86-NEXT: vucomxss (%eax), %xmm0
; AVX10_2_X86-NEXT: sete %al
; AVX10_2_X86-NEXT: retl
%x = load float, ptr %xp
%y = load float, ptr %yp
%1 = fcmp oeq float %x, %y
ret i1 %1
}

define i1 @une_mem(ptr %xp, ptr %yp) {
; AVX10_2_X64-LABEL: une_mem:
; AVX10_2_X64: # %bb.0:
; AVX10_2_X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX10_2_X64-NEXT: vucomxss (%rsi), %xmm0
; AVX10_2_X64-NEXT: setne %al
; AVX10_2_X64-NEXT: retq
;
; AVX10_2_X86-LABEL: une_mem:
; AVX10_2_X86: # %bb.0:
; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX10_2_X86-NEXT: vucomxss (%eax), %xmm0
; AVX10_2_X86-NEXT: setne %al
; AVX10_2_X86-NEXT: retl
%x = load float, ptr %xp
%y = load float, ptr %yp
%1 = fcmp une float %x, %y
ret i1 %1
}


define i1 @ogt_mem(ptr %xp, ptr %yp) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto.

; AVX10_2_X64-LABEL: ogt_mem:
; AVX10_2_X64: # %bb.0:
; AVX10_2_X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX10_2_X64-NEXT: vucomiss (%rsi), %xmm0
; AVX10_2_X64-NEXT: seta %al
; AVX10_2_X64-NEXT: retq
;
; AVX10_2_X86-LABEL: ogt_mem:
; AVX10_2_X86: # %bb.0:
; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX10_2_X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX10_2_X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX10_2_X86-NEXT: vucomiss (%eax), %xmm0
; AVX10_2_X86-NEXT: seta %al
; AVX10_2_X86-NEXT: retl
%x = load float, ptr %xp
%y = load float, ptr %yp
%1 = fcmp ogt float %x, %y
ret i1 %1
}
1 change: 1 addition & 0 deletions llvm/test/TableGen/x86-fold-tables.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1961,6 +1961,7 @@ static const X86FoldTableEntry Table1[] = {
{X86::VUCOMISSrr_Int, X86::VUCOMISSrm_Int, TB_NO_REVERSE},
{X86::VUCOMXSDZrr_Int, X86::VUCOMXSDZrm_Int, TB_NO_REVERSE},
{X86::VUCOMXSHZrr_Int, X86::VUCOMXSHZrm_Int, TB_NO_REVERSE},
{X86::VUCOMXSSZrr, X86::VUCOMXSSZrm, 0},
{X86::VUCOMXSSZrr_Int, X86::VUCOMXSSZrm_Int, TB_NO_REVERSE},
{X86::XOR16ri8_ND, X86::XOR16mi8_ND, 0},
{X86::XOR16ri8_NF_ND, X86::XOR16mi8_NF_ND, 0},
Expand Down
Loading