Skip to content

Commit 68e1a8d

Browse files
committed
[X86] Defer the creation of LCMPXCHG16B_SAVE_RBX until finalize-isel
We need to use LCMPXCHG16B_SAVE_RBX if RBX/EBX is being used as the frame pointer. We previously checked for this during type legalization, but that's too early to know for sure if the base pointer is needed. This patch adds a new pseudo instruction to emit from isel that uses a virtual register for the RBX input. Then we use the custom inserter hook to emit LCMPXCHG16B if RBX isn't needed as a base pointer or LCMPXCHG16B_SAVE_RBX if it is. Fixes PR42064. Reviewed By: pengfei Differential Revision: https://reviews.llvm.org/D88808
1 parent c102488 commit 68e1a8d

File tree

6 files changed

+171
-63
lines changed

6 files changed

+171
-63
lines changed

llvm/lib/Target/X86/X86ExpandPseudo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,9 +338,9 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
338338
// Perform the following transformation.
339339
// SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
340340
// =>
341-
// [E|R]BX = InArg
341+
// RBX = InArg
342342
// actualcmpxchg Addr
343-
// [E|R]BX = SaveRbx
343+
// RBX = SaveRbx
344344
const MachineOperand &InArg = MBBI->getOperand(6);
345345
Register SaveRbx = MBBI->getOperand(7).getReg();
346346

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 41 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -30481,38 +30481,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
3048130481
swapInH =
3048230482
DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
3048330483
swapInH, cpInH.getValue(1));
30484-
// If the current function needs the base pointer, RBX,
30485-
// we shouldn't use cmpxchg directly.
30486-
// Indeed the lowering of that instruction will clobber
30487-
// that register and since RBX will be a reserved register
30488-
// the register allocator will not make sure its value will
30489-
// be properly saved and restored around this live-range.
30490-
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
30484+
30485+
// In 64-bit mode we might need the base pointer in RBX, but we can't know
30486+
// until later. So we keep the RBX input in a vreg and use a custom
30487+
// inserter.
30488+
// Since RBX will be a reserved register the register allocator will not
30489+
// make sure its value will be properly saved and restored around this
30490+
// live-range.
3049130491
SDValue Result;
3049230492
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30493-
Register BasePtr = TRI->getBaseRegister();
3049430493
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
30495-
if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
30496-
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
30497-
assert(Regs64bit && "RBX/EBX base pointer only expected for i128 CAS");
30498-
SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
30499-
X86::RBX,
30500-
HalfT, swapInH.getValue(1));
30501-
SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
30502-
RBXSave,
30503-
/*Glue*/ RBXSave.getValue(2)};
30504-
Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_SAVE_RBX_DAG, dl, Tys,
30505-
Ops, T, MMO);
30494+
if (Regs64bit) {
30495+
SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
30496+
swapInH.getValue(1)};
30497+
Result =
30498+
DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
3050630499
} else {
30507-
unsigned Opcode =
30508-
Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
30509-
swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
30510-
Regs64bit ? X86::RBX : X86::EBX, swapInL,
30500+
swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
3051130501
swapInH.getValue(1));
3051230502
SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
3051330503
swapInL.getValue(1)};
30514-
Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
30504+
Result =
30505+
DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
3051530506
}
30507+
3051630508
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
3051730509
Regs64bit ? X86::RAX : X86::EAX,
3051830510
HalfT, Result.getValue(1));
@@ -30811,7 +30803,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3081130803
NODE_NAME_CASE(LCMPXCHG_DAG)
3081230804
NODE_NAME_CASE(LCMPXCHG8_DAG)
3081330805
NODE_NAME_CASE(LCMPXCHG16_DAG)
30814-
NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
3081530806
NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
3081630807
NODE_NAME_CASE(LADD)
3081730808
NODE_NAME_CASE(LSUB)
@@ -33770,11 +33761,31 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
3377033761

3377133762
return BB;
3377233763
}
33773-
case X86::LCMPXCHG16B:
33774-
return BB;
33775-
case X86::LCMPXCHG16B_SAVE_RBX: {
33776-
if (!BB->isLiveIn(X86::RBX))
33777-
BB->addLiveIn(X86::RBX);
33764+
case X86::LCMPXCHG16B_NO_RBX: {
33765+
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
33766+
Register BasePtr = TRI->getBaseRegister();
33767+
X86AddressMode AM = getAddressFromInstr(&MI, 0);
33768+
if (TRI->hasBasePointer(*MF) &&
33769+
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
33770+
if (!BB->isLiveIn(BasePtr))
33771+
BB->addLiveIn(BasePtr);
33772+
// Save RBX into a virtual register.
33773+
Register SaveRBX =
33774+
MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
33775+
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
33776+
.addReg(X86::RBX);
33777+
Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
33778+
addFullAddress(
33779+
BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst), AM)
33780+
.add(MI.getOperand(X86::AddrNumOperands))
33781+
.addReg(SaveRBX);
33782+
} else {
33783+
// Simple case, just copy the virtual register to RBX.
33784+
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
33785+
.add(MI.getOperand(X86::AddrNumOperands));
33786+
addFullAddress(BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B)), AM);
33787+
}
33788+
MI.eraseFromParent();
3377833789
return BB;
3377933790
}
3378033791
case X86::MWAITX: {

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -756,7 +756,6 @@ namespace llvm {
756756
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
757757
LCMPXCHG8_DAG,
758758
LCMPXCHG16_DAG,
759-
LCMPXCHG8_SAVE_EBX_DAG,
760759
LCMPXCHG16_SAVE_RBX_DAG,
761760

762761
/// LOCK-prefixed arithmetic read-modify-write instructions.

llvm/lib/Target/X86/X86InstrCompiler.td

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -809,15 +809,6 @@ let Predicates = [UseIncDec] in {
809809
}
810810

811811
// Atomic compare and swap.
812-
multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
813-
SDPatternOperator frag, X86MemOperand x86memop> {
814-
let isCodeGenOnly = 1, usesCustomInserter = 1 in {
815-
def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
816-
!strconcat(mnemonic, "\t$ptr"),
817-
[(frag addr:$ptr)]>, TB, LOCK;
818-
}
819-
}
820-
821812
multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
822813
string mnemonic, SDPatternOperator frag> {
823814
let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
@@ -841,14 +832,19 @@ let isCodeGenOnly = 1, SchedRW = [WriteCMPXCHGRMW] in {
841832
}
842833

843834
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
844-
Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW] in {
845-
defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
835+
Predicates = [HasCmpxchg8b], SchedRW = [WriteCMPXCHGRMW],
836+
isCodeGenOnly = 1, usesCustomInserter = 1 in {
837+
def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
838+
"cmpxchg8b\t$ptr",
839+
[(X86cas8 addr:$ptr)]>, TB, LOCK;
846840
}
847841

848842
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
849-
Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW] in {
850-
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
851-
X86cas16, i128mem>, REX_W;
843+
Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
844+
isCodeGenOnly = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in {
845+
def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
846+
"cmpxchg16b\t$ptr",
847+
[]>, TB, LOCK;
852848
}
853849

854850
// This pseudo must be used when the frame uses RBX as
@@ -872,14 +868,24 @@ defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
872868
// the value of RBX.
873869
let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX],
874870
Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
875-
isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst",
876-
usesCustomInserter = 1 in {
871+
isCodeGenOnly = 1, isPseudo = 1,
872+
mayLoad = 1, mayStore = 1, hasSideEffects = 0,
873+
Constraints = "$rbx_save = $dst" in {
877874
def LCMPXCHG16B_SAVE_RBX :
878875
I<0, Pseudo, (outs GR64:$dst),
879-
(ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save),
880-
!strconcat("cmpxchg16b", "\t$ptr"),
881-
[(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input,
882-
GR64:$rbx_save))]>;
876+
(ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), "", []>;
877+
}
878+
879+
// Pseudo instruction that doesn't read/write RBX. Will be turned into either
880+
// LCMPXCHG16B_SAVE_RBX or LCMPXCHG16B via a custom inserter.
881+
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RCX, RDX],
882+
Predicates = [HasCmpxchg16b,In64BitMode], SchedRW = [WriteCMPXCHGRMW],
883+
isCodeGenOnly = 1, isPseudo = 1,
884+
mayLoad = 1, mayStore = 1, hasSideEffects = 0,
885+
usesCustomInserter = 1 in {
886+
def LCMPXCHG16B_NO_RBX :
887+
I<0, Pseudo, (outs), (ins i128mem:$ptr, GR64:$rbx_input), "",
888+
[(X86cas16 addr:$ptr, GR64:$rbx_input)]>;
883889
}
884890

885891
// This pseudo must be used when the frame uses RBX/EBX as

llvm/lib/Target/X86/X86InstrInfo.td

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,8 @@ def SDTX86wrpkru : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
6969

7070
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
7171
SDTCisVT<2, i8>]>;
72-
def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
73-
def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3,
74-
[SDTCisVT<0, i64>, SDTCisPtrTy<1>,
75-
SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
72+
def SDTX86cas8pair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
73+
def SDTX86cas16pair : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i64>]>;
7674

7775
def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
7876
SDTCisPtrTy<1>,
@@ -171,16 +169,12 @@ def X86wrpkru : SDNode<"X86ISD::WRPKRU", SDTX86wrpkru,
171169
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
172170
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
173171
SDNPMayLoad, SDNPMemOperand]>;
174-
def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
172+
def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8pair,
175173
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
176174
SDNPMayLoad, SDNPMemOperand]>;
177-
def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
175+
def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86cas16pair,
178176
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
179177
SDNPMayLoad, SDNPMemOperand]>;
180-
def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG",
181-
SDTX86caspairSaveRbx16,
182-
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
183-
SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
184178

185179
def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
186180
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

llvm/test/CodeGen/X86/pr42064.ll

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc19.11.0 -mattr=+avx,+cx16 | FileCheck %s
3+
4+
%struct.TestStruct = type { %union.Int128 }
5+
%union.Int128 = type { i128 }
6+
%struct.SomeArrays = type { %struct.SillyArray, %struct.SillyArray, %struct.SillyArray }
7+
%struct.SillyArray = type { i8*, i32, i32 }
8+
9+
declare void @llvm.lifetime.start.p0i8(i64, i8*)
10+
11+
define void @foo(%struct.TestStruct* %arg) align 2 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
12+
; Check that %rbx is being used for a frame pointer
13+
; CHECK-LABEL: foo:
14+
; CHECK: movq %rsp, %rbx
15+
16+
; Check that %rbx is saved and restored around both lock cmpxchg16b.
17+
; CHECK: movq %rbx, %r9
18+
; CHECK-NEXT: movabsq $1393743441367457520, %rcx # imm = 0x135792468ABCDEF0
19+
; CHECK-NEXT: movq %rcx, %rax
20+
; CHECK-NEXT: movq %rcx, %rdx
21+
; CHECK-NEXT: movq %rcx, %rbx
22+
; CHECK-NEXT: lock cmpxchg16b (%r8)
23+
; CHECK-NEXT: movq %r9, %rbx
24+
25+
; CHECK: movq %rbx, %r9
26+
; CHECK-NEXT: movq %rcx, %rax
27+
; CHECK-NEXT: movq %rcx, %rdx
28+
; CHECK-NEXT: movq %rcx, %rbx
29+
; CHECK-NEXT: lock cmpxchg16b (%r8)
30+
; CHECK-NEXT: movq %r9, %rbx
31+
bb:
32+
%i = alloca %struct.SomeArrays, align 8
33+
%i1 = alloca %struct.SomeArrays, align 8
34+
%i2 = getelementptr inbounds %struct.TestStruct, %struct.TestStruct* %arg, i64 0, i32 0, i32 0
35+
%i3 = cmpxchg i128* %i2, i128 25710028567316702934644703134494809840, i128 25710028567316702934644703134494809840 seq_cst seq_cst
36+
%i4 = extractvalue { i128, i1 } %i3, 0
37+
%i5 = trunc i128 %i4 to i64
38+
%i6 = icmp eq i64 %i5, 0
39+
br i1 %i6, label %bb9, label %bb7
40+
41+
bb7: ; preds = %bb
42+
%i8 = cmpxchg i128* %i2, i128 25710028567316702934644703134494809840, i128 25710028567316702934644703134494809840 seq_cst seq_cst
43+
br label %bb9
44+
45+
bb9: ; preds = %bb7, %bb
46+
%i10 = bitcast %struct.SomeArrays* %i to i8*
47+
call void @llvm.lifetime.start.p0i8(i64 48, i8* nonnull %i10)
48+
call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) %i10, i8 0, i64 48, i1 false)
49+
%i11 = bitcast %struct.SomeArrays* %i1 to i8*
50+
call void @llvm.lifetime.start.p0i8(i64 48, i8* nonnull %i11)
51+
%i12 = bitcast %struct.SomeArrays* %i1 to i8*
52+
call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) %i12, i8 0, i64 48, i1 false)
53+
%i13 = invoke nonnull align 8 dereferenceable(48) %struct.SomeArrays* @"??4SomeArrays@@QEAAAEAU0@$$QEAU0@@Z"(%struct.SomeArrays* nonnull %i, %struct.SomeArrays* nonnull align 8 dereferenceable(48) %i1)
54+
to label %bb14 unwind label %bb45
55+
56+
bb14: ; preds = %bb9
57+
call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %i10)
58+
ret void
59+
60+
bb45: ; preds = %bb9
61+
%i46 = cleanuppad within none []
62+
%i47 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %i1, i64 0, i32 2, i32 0
63+
%i48 = load i8*, i8** %i47, align 8
64+
invoke void @"?free@@YAXPEAX@Z"(i8* %i48) [ "funclet"(token %i46) ]
65+
to label %bb51 unwind label %bb49
66+
67+
bb49: ; preds = %bb45
68+
%i50 = cleanuppad within %i46 []
69+
call void @__std_terminate() [ "funclet"(token %i50) ]
70+
unreachable
71+
72+
bb51: ; preds = %bb45
73+
%i52 = getelementptr inbounds %struct.SomeArrays, %struct.SomeArrays* %i1, i64 0, i32 1, i32 0
74+
%i53 = load i8*, i8** %i52, align 8
75+
invoke void @"?free@@YAXPEAX@Z"(i8* %i53) [ "funclet"(token %i46) ]
76+
to label %bb56 unwind label %bb54
77+
78+
bb54: ; preds = %bb51
79+
%i55 = cleanuppad within %i46 []
80+
call void @__std_terminate() [ "funclet"(token %i55) ]
81+
unreachable
82+
83+
bb56: ; preds = %bb51
84+
call void @llvm.lifetime.end.p0i8(i64 48, i8* nonnull %i10)
85+
cleanupret from %i46 unwind to caller
86+
}
87+
88+
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
89+
90+
declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i1)
91+
92+
declare dso_local i32 @__CxxFrameHandler3(...)
93+
94+
declare nonnull align 8 dereferenceable(48) %struct.SomeArrays* @"??4SomeArrays@@QEAAAEAU0@$$QEAU0@@Z"(%struct.SomeArrays*, %struct.SomeArrays* nonnull align 8 dereferenceable(48)) align 2
95+
96+
declare void @"?free@@YAXPEAX@Z"(i8*)
97+
98+
declare void @__std_terminate()

0 commit comments

Comments
 (0)