Skip to content

Commit 4780bd9

Browse files
LuoYuankeYuanke Luo
andauthored
[X86] Fix spill issue for fr16 (#155225)
When avx512fp16 is not available, we use MOVSS to spill fr16/fr16x registers. However, MOVSSmr requires the fr32 register class and MOVSSrm requires the vr128 register class, which causes a bad instruction to be detected by the machine verifier. To fix the issue, this patch creates a pseudo instruction, MOVSHP, for fr16 register spilling. MOVSHP is expanded to MOVSS, VMOVSS, or VMOVSSZ depending on AVX availability and the register number. --------- Co-authored-by: Yuanke Luo <[email protected]>
1 parent 8849750 commit 4780bd9

File tree

4 files changed

+134
-7
lines changed

4 files changed

+134
-7
lines changed

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4399,13 +4399,8 @@ static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
43994399
if (STI.hasFP16())
44004400
return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
44014401
if (Load)
4402-
return STI.hasAVX512() ? X86::VMOVSSZrm
4403-
: STI.hasAVX() ? X86::VMOVSSrm
4404-
: X86::MOVSSrm;
4405-
else
4406-
return STI.hasAVX512() ? X86::VMOVSSZmr
4407-
: STI.hasAVX() ? X86::VMOVSSmr
4408-
: X86::MOVSSmr;
4402+
return X86::MOVSHPrm;
4403+
return X86::MOVSHPmr;
44094404
}
44104405

44114406
static unsigned getLoadStoreRegOpcode(Register Reg,
@@ -6131,6 +6126,25 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
61316126
return true;
61326127
}
61336128

6129+
/// Rewrite an FP16 spill/reload pseudo (MOVSHPrm or MOVSHPmr) in place into a
/// concrete scalar single-precision move.
///
/// The replacement opcode depends on the register being loaded or stored:
///  - a register numbered above XMM15 selects VMOVSSZrm / VMOVSSZmr;
///  - otherwise VMOVSSrm / VMOVSSmr when \p HasAVX is set;
///  - otherwise MOVSSrm / MOVSSmr.
///
/// Only the instruction descriptor of \p MIB is changed; the operands are
/// left as they are. Any opcode other than MOVSHPrm is treated as the store
/// form. Always returns true.
static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI,
                         const TargetInstrInfo &TII, bool HasAVX) {
  const bool IsLoad = MI.getOpcode() == X86::MOVSHPrm;

  // The load form defines its register as operand 0; the store form carries
  // the source register at operand 5, after the memory operand.
  Register DataReg = MI.getOperand(IsLoad ? 0 : 5).getReg();

  unsigned NewOpc;
  if (DataReg > X86::XMM15)
    NewOpc = IsLoad ? X86::VMOVSSZrm : X86::VMOVSSZmr;
  else if (HasAVX)
    NewOpc = IsLoad ? X86::VMOVSSrm : X86::VMOVSSmr;
  else
    NewOpc = IsLoad ? X86::MOVSSrm : X86::MOVSSmr;

  MIB->setDesc(TII.get(NewOpc));
  return true;
}
6147+
61346148
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
61356149
bool HasAVX = Subtarget.hasAVX();
61366150
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -6203,6 +6217,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
62036217
}
62046218
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
62056219
}
6220+
case X86::MOVSHPmr:
6221+
case X86::MOVSHPrm:
6222+
return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
62066223
case X86::V_SETALLONES:
62076224
return Expand2AddrUndef(MIB,
62086225
get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,18 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
267267
}
268268
}
269269

270+
// Pseudo instructions for FP16 register spill (MOVSHPmr) and reload (MOVSHPrm); expanded to a MOVSS-family opcode in expandPostRAPseudo.
271+
let isPseudo = 1, Predicates = [HasSSE2] in {
272+
let mayStore = 1 in
273+
def MOVSHPmr : I<0, Pseudo, (outs), (ins f32mem:$dst, FR16X:$src), "",
274+
[], SSEPackedSingle>,
275+
Sched<[WriteFStore]>;
276+
let mayLoad = 1 in
277+
def MOVSHPrm : I<0, Pseudo, (outs FR16X:$dst), (ins f32mem:$src), "",
278+
[], SSEPackedSingle>,
279+
Sched<[WriteFLoad]>;
280+
}
281+
270282
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
271283
SSEPackedSingle, UseSSE1>, TB, XS;
272284
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=x86_64-unknown -start-before=twoaddressinstruction -stop-after=postrapseudos -verify-machineinstrs -o - %s | FileCheck %s
3+
4+
...
5+
---
6+
name: test
7+
alignment: 16
8+
tracksRegLiveness: true
9+
debugInstrRef: true
10+
registers:
11+
liveins:
12+
- { reg: '$xmm0', virtual-reg: '%0' }
13+
frameInfo:
14+
maxAlignment: 1
15+
hasCalls: true
16+
machineFunctionInfo: {}
17+
body: |
18+
bb.0:
19+
liveins: $xmm0
20+
21+
; CHECK-LABEL: name: test
22+
; CHECK: liveins: $xmm0
23+
; CHECK-NEXT: {{ $}}
24+
; CHECK-NEXT: MOVSSmr $rsp, 1, $noreg, -4, $noreg, $xmm0 :: (store (s32) into %stack.0, align 2)
25+
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $xmm0, 12 /* clobber */, implicit-def dead early-clobber $xmm1, 12 /* clobber */, implicit-def dead early-clobber $xmm2, 12 /* clobber */, implicit-def dead early-clobber $xmm3, 12 /* clobber */, implicit-def dead early-clobber $xmm4, 12 /* clobber */, implicit-def dead early-clobber $xmm5, 12 /* clobber */, implicit-def dead early-clobber $xmm6, 12 /* clobber */, implicit-def dead early-clobber $xmm7, 12 /* clobber */, implicit-def dead early-clobber $xmm8, 12 /* clobber */, implicit-def dead early-clobber $xmm9, 12 /* clobber */, implicit-def dead early-clobber $xmm10, 12 /* clobber */, implicit-def dead early-clobber $xmm11, 12 /* clobber */, implicit-def dead early-clobber $xmm12, 12 /* clobber */, implicit-def dead early-clobber $xmm13, 12 /* clobber */, implicit-def dead early-clobber $xmm14, 12 /* clobber */, implicit-def dead early-clobber $xmm15, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
26+
; CHECK-NEXT: renamable $xmm0 = MOVSSrm $rsp, 1, $noreg, -4, $noreg :: (load (s32) from %stack.0, align 2)
27+
; CHECK-NEXT: FNOP implicit-def $fpsw, implicit killed renamable $xmm0
28+
; CHECK-NEXT: RET 0
29+
%0:fr16 = COPY killed $xmm0
30+
INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $xmm0, 12 /* clobber */, implicit-def dead early-clobber $xmm1, 12 /* clobber */, implicit-def dead early-clobber $xmm2, 12 /* clobber */, implicit-def dead early-clobber $xmm3, 12 /* clobber */, implicit-def dead early-clobber $xmm4, 12 /* clobber */, implicit-def dead early-clobber $xmm5, 12 /* clobber */, implicit-def dead early-clobber $xmm6, 12 /* clobber */, implicit-def dead early-clobber $xmm7, 12 /* clobber */, implicit-def dead early-clobber $xmm8, 12 /* clobber */, implicit-def dead early-clobber $xmm9, 12 /* clobber */, implicit-def dead early-clobber $xmm10, 12 /* clobber */, implicit-def dead early-clobber $xmm11, 12 /* clobber */, implicit-def dead early-clobber $xmm12, 12 /* clobber */, implicit-def dead early-clobber $xmm13, 12 /* clobber */, implicit-def dead early-clobber $xmm14, 12 /* clobber */, implicit-def dead early-clobber $xmm15, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
31+
FNOP implicit-def $fpsw, implicit %0:fr16
32+
RET 0
33+
34+
...
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefixes=SSE2
3+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
5+
6+
define half @test(float %f, ptr %p) nounwind {
7+
; SSE2-LABEL: test:
8+
; SSE2: # %bb.0:
9+
; SSE2-NEXT: pushq %rbx
10+
; SSE2-NEXT: subq $16, %rsp
11+
; SSE2-NEXT: movq %rdi, %rbx
12+
; SSE2-NEXT: callq __truncsfhf2@PLT
13+
; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
14+
; SSE2-NEXT: callq __extendhfsf2@PLT
15+
; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
16+
; SSE2-NEXT: #APP
17+
; SSE2-NEXT: #NO_APP
18+
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
19+
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
20+
; SSE2-NEXT: movss %xmm0, (%rbx)
21+
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
22+
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
23+
; SSE2-NEXT: addq $16, %rsp
24+
; SSE2-NEXT: popq %rbx
25+
; SSE2-NEXT: retq
26+
;
27+
; AVX-LABEL: test:
28+
; AVX: # %bb.0:
29+
; AVX-NEXT: pushq %rbx
30+
; AVX-NEXT: subq $16, %rsp
31+
; AVX-NEXT: movq %rdi, %rbx
32+
; AVX-NEXT: callq __truncsfhf2@PLT
33+
; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
34+
; AVX-NEXT: callq __extendhfsf2@PLT
35+
; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
36+
; AVX-NEXT: #APP
37+
; AVX-NEXT: #NO_APP
38+
; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
39+
; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero
40+
; AVX-NEXT: vmovss %xmm0, (%rbx)
41+
; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
42+
; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero
43+
; AVX-NEXT: addq $16, %rsp
44+
; AVX-NEXT: popq %rbx
45+
; AVX-NEXT: retq
46+
;
47+
; AVX512-LABEL: test:
48+
; AVX512: # %bb.0:
49+
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
50+
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
51+
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
52+
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
53+
; AVX512-NEXT: #APP
54+
; AVX512-NEXT: #NO_APP
55+
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
56+
; AVX512-NEXT: vmovss %xmm0, (%rdi)
57+
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
58+
; AVX512-NEXT: retq
59+
%t = fptrunc float %f to half
60+
%t2 = fpext half %t to float
61+
tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
62+
store float %t2, ptr %p
63+
ret half %t
64+
}

0 commit comments

Comments
 (0)