[X86] Remove SETB_C8r/SETB_C16r pseudo instructions. Use SETB_C32r and EXTRACT_SUBREG instead.

topperc · topperc · commit 600f2e1c4de5 · 2020-02-06T10:22:24.000-08:00
Only 32 and 64 bit SBB are dependency breaking instructons on some CPUs. The 8 and 16 bit forms have to preserve upper bits of the GPR. This patch removes the smaller forms and selects the wider form instead. I had to do this with custom code as the tblgen generated code glued the eflags copytoreg to the extract_subreg instead of to the SETB pseudo. Longer term I think we can remove X86ISD::SETCC_CARRY and use (X86ISD::SBB zero, zero). We'll want to keep the pseudo and select (X86ISD::SBB zero, zero) to either a MOV32r0+SBB for targets where there is no dependency break and SETB_C32/SETB_C64 for targets that have a dependency break. May want some way to avoid the MOV32r0 if the instruction that produced the carry flag happened to def a register that we can use for the dependency. I think the flag copy lowering should be using NEG instead of SUB to handle SETB. That would avoid the MOV32r0 there. Or maybe it should use a ADC with -1 to recreate the carry flag and keep the SETB? That would avoid a MOVZX on the input of the SUB. Differential Revision: https://reviews.llvm.org/D74024
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -639,8 +639,6 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
           FlagsKilled = true;
 
           switch (MI.getOpcode()) {
-          case X86::SETB_C8r:
-          case X86::SETB_C16r:
           case X86::SETB_C32r:
           case X86::SETB_C64r:
             // Use custom lowering for arithmetic that is merely extending the
@@ -1057,24 +1055,9 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
 
   unsigned Sub;
   switch (SetBI.getOpcode()) {
-  case X86::SETB_C8r:
-    Sub = X86::SUB8rr;
-    break;
-
-  case X86::SETB_C16r:
-    Sub = X86::SUB16rr;
-    break;
-
-  case X86::SETB_C32r:
-    Sub = X86::SUB32rr;
-    break;
-
-  case X86::SETB_C64r:
-    Sub = X86::SUB64rr;
-    break;
-
-  default:
-    llvm_unreachable("Invalid SETB_C* opcode!");
+  default: llvm_unreachable("Invalid SETB_C* opcode!");
+  case X86::SETB_C32r: Sub = X86::SUB32rr; break;
+  case X86::SETB_C64r: Sub = X86::SUB64rr; break;
   }
   Register ResultReg = MRI->createVirtualRegister(&SetBRC);
   BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5270,6 +5270,35 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     if (foldLoadStoreIntoMemOperand(Node))
       return;
     break;
+
+  case X86ISD::SETCC_CARRY: {
+    // We have to do this manually because tblgen will put the eflags copy in
+    // the wrong place if we use an extract_subreg in the pattern.
+    MVT VT = Node->getSimpleValueType(0);
+    SDValue Chain = CurDAG->getEntryNode();
+
+    // Copy flags to the EFLAGS register and glue it to next node.
+    SDValue EFLAGS = CurDAG->getCopyToReg(Chain, dl, X86::EFLAGS,
+                                          Node->getOperand(1), SDValue());
+    Chain = EFLAGS;
+
+    // Create a 64-bit instruction if the result is 64-bits otherwise use the
+    // 32-bit version.
+    unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
+    MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+    SDValue Result = SDValue(
+        CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);
+
+    // For less than 32-bits we need to extract from the 32-bit node.
+    if (VT == MVT::i8 || VT == MVT::i16) {
+      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
+      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
+    }
+
+    ReplaceUses(SDValue(Node, 0), Result);
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
   }
 
   SelectCode(Node);
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -308,18 +308,13 @@ def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>;
 def : Pat<(i64 mov64imm32:$src), (MOV32ri64 mov64imm32:$src)>;
 
 // Use sbb to materialize carry bit.
-let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteADC],
+    hasSideEffects = 0 in {
 // FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
 // However, Pat<> can't replicate the destination reg into the inputs of the
 // result.
-def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
-                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
-                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
-                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
-                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", []>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", []>;
 } // isCodeGenOnly
 
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4085,10 +4085,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case X86::MOV32ImmSExti8:
   case X86::MOV64ImmSExti8:
     return ExpandMOVImmSExti8(MIB, *this, Subtarget);
-  case X86::SETB_C8r:
-    return Expand2AddrUndef(MIB, get(X86::SBB8rr));
-  case X86::SETB_C16r:
-    return Expand2AddrUndef(MIB, get(X86::SBB16rr));
   case X86::SETB_C32r:
     return Expand2AddrUndef(MIB, get(X86::SBB32rr));
   case X86::SETB_C64r:
diff --git a/llvm/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll b/llvm/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
@@ -12,7 +12,7 @@ define i32 @main() nounwind {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    cmpq {{.*}}(%rip), %rax
-; CHECK-NEXT:    sbbb %al, %al
+; CHECK-NEXT:    sbbl %eax, %eax
 ; CHECK-NEXT:    testb $-106, %al
 ; CHECK-NEXT:    jle .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %if.then
diff --git a/llvm/test/CodeGen/X86/flags-copy-lowering.mir b/llvm/test/CodeGen/X86/flags-copy-lowering.mir
@@ -541,37 +541,17 @@ body:             |
     ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
 
     $eflags = COPY %3
-    %4:gr8 = SETB_C8r implicit-def $eflags, implicit $eflags
-    MOV8mr $rsp, 1, $noreg, -16, $noreg, killed %4
-  ; CHECK-NOT:     $eflags =
-  ; CHECK:         %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags
-  ; CHECK-NEXT:    %[[ZERO_SUBREG:[^:]*]]:gr8 = COPY %[[ZERO]].sub_8bit
-  ; CHECK-NEXT:    %[[REPLACEMENT:[^:]*]]:gr8 = SUB8rr %[[ZERO_SUBREG]], %[[CF_REG]]
-  ; CHECK-NEXT:    MOV8mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]]
-
-    $eflags = COPY %3
-    %5:gr16 = SETB_C16r implicit-def $eflags, implicit $eflags
-    MOV16mr $rsp, 1, $noreg, -16, $noreg, killed %5
-  ; CHECK-NOT:     $eflags =
-  ; CHECK:         %[[CF_EXT:[^:]*]]:gr32 = MOVZX32rr8 %[[CF_REG]]
-  ; CHECK-NEXT:    %[[CF_TRUNC:[^:]*]]:gr16 = COPY %[[CF_EXT]].sub_16bit
-  ; CHECK-NEXT:    %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags
-  ; CHECK-NEXT:    %[[ZERO_SUBREG:[^:]*]]:gr16 = COPY %[[ZERO]].sub_16bit
-  ; CHECK-NEXT:    %[[REPLACEMENT:[^:]*]]:gr16 = SUB16rr %[[ZERO_SUBREG]], %[[CF_TRUNC]]
-  ; CHECK-NEXT:    MOV16mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]]
-
-    $eflags = COPY %3
-    %6:gr32 = SETB_C32r implicit-def $eflags, implicit $eflags
-    MOV32mr $rsp, 1, $noreg, -16, $noreg, killed %6
+    %4:gr32 = SETB_C32r implicit-def $eflags, implicit $eflags
+    MOV32mr $rsp, 1, $noreg, -16, $noreg, killed %4
   ; CHECK-NOT:     $eflags =
   ; CHECK:         %[[CF_EXT:[^:]*]]:gr32 = MOVZX32rr8 %[[CF_REG]]
   ; CHECK-NEXT:    %[[ZERO:[^:]*]]:gr32 = MOV32r0 implicit-def $eflags
   ; CHECK-NEXT:    %[[REPLACEMENT:[^:]*]]:gr32 = SUB32rr %[[ZERO]], %[[CF_EXT]]
   ; CHECK-NEXT:    MOV32mr $rsp, 1, $noreg, -16, $noreg, killed %[[REPLACEMENT]]
 
     $eflags = COPY %3
-    %7:gr64 = SETB_C64r implicit-def $eflags, implicit $eflags
-    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %7
+    %5:gr64 = SETB_C64r implicit-def $eflags, implicit $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
   ; CHECK-NOT:     $eflags =
   ; CHECK:         %[[CF_EXT1:[^:]*]]:gr32 = MOVZX32rr8 %[[CF_REG]]
   ; CHECK-NEXT:    %[[CF_EXT2:[^:]*]]:gr64 = SUBREG_TO_REG 0, %[[CF_EXT1]], %subreg.sub_32bit
diff --git a/llvm/test/CodeGen/X86/sbb.ll b/llvm/test/CodeGen/X86/sbb.ll
@@ -9,7 +9,8 @@ define i8 @i8_select_0_or_neg1(i8 %x) {
 ; CHECK-LABEL: i8_select_0_or_neg1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    negb %dil
-; CHECK-NEXT:    sbbb %al, %al
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %cmp = icmp eq i8 %x, 0
   %sel = select i1 %cmp, i8 0, i8 -1
@@ -22,7 +23,8 @@ define i16 @i16_select_0_or_neg1_as_math(i16 %x) {
 ; CHECK-LABEL: i16_select_0_or_neg1_as_math:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    negw %di
-; CHECK-NEXT:    sbbw %ax, %ax
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %cmp = icmp eq i16 %x, 0
   %ext = zext i1 %cmp to i16
@@ -90,7 +92,8 @@ define i16 @i16_select_neg1_or_0_commuted(i16 %x) {
 ; CHECK-LABEL: i16_select_neg1_or_0_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    cmpw $1, %di
-; CHECK-NEXT:    sbbw %ax, %ax
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i16 %x, 0
   %sel = select i1 %cmp, i16 0, i16 -1
@@ -103,7 +106,8 @@ define i8 @i8_select_neg1_or_0_commuted_as_math(i8 %x) {
 ; CHECK-LABEL: i8_select_neg1_or_0_commuted_as_math:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    cmpb $1, %dil
-; CHECK-NEXT:    sbbb %al, %al
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i8 %x, 0
   %ext = zext i1 %cmp to i8
@@ -205,7 +209,8 @@ define i16 @ult_select_neg1_or_0_sub(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: ult_select_neg1_or_0_sub:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    cmpw %di, %si
-; CHECK-NEXT:    sbbw %ax, %ax
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i16 %y, %x
   %zext = zext i1 %cmp to i16
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -666,7 +666,8 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; SSE-NEXT:    packsswb %xmm1, %xmm0
 ; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    negl %eax
-; SSE-NEXT:    sbbw %ax, %ax
+; SSE-NEXT:    sbbl %eax, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i16_legal_sext:
@@ -678,7 +679,8 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    negl %eax
-; AVX1-NEXT:    sbbw %ax, %ax
+; AVX1-NEXT:    sbbl %eax, %eax
+; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -689,7 +691,8 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    sbbw %ax, %ax
+; AVX2-NEXT:    sbbl %eax, %eax
+; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -731,15 +734,17 @@ define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
 ; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    negl %eax
-; SSE-NEXT:    sbbb %al, %al
+; SSE-NEXT:    sbbl %eax, %eax
+; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16i8_sext:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX-NEXT:    negl %eax
-; AVX-NEXT:    sbbb %al, %al
+; AVX-NEXT:    sbbl %eax, %eax
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v16i8_sext:
@@ -778,7 +783,8 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    negl %eax
-; SSE-NEXT:    sbbb %al, %al
+; SSE-NEXT:    sbbl %eax, %eax
+; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v32i8_sext:
@@ -790,7 +796,8 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    negl %eax
-; AVX1-NEXT:    sbbb %al, %al
+; AVX1-NEXT:    sbbl %eax, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -799,7 +806,8 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX2-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmovmskb %ymm0, %eax
 ; AVX2-NEXT:    negl %eax
-; AVX2-NEXT:    sbbb %al, %al
+; AVX2-NEXT:    sbbl %eax, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;