Skip to content

Commit 0c6f983

Browse files
[AMX][PreTileConfig] Ensure that the PLDTILECFGV instruction is sunk closer to the tile use instruction. (#155673)
According to the AMX ABI, tile registers (including the config register) are volatile, so the caller is required to save and restore the config register. This is done in X86's FastPreTileConfig pass. Currently, the PLDTILECFGV instruction is emitted immediately after the call, which can be problematic if the call returns a value in, say, the rax register and the AMX tile is configured using the same register. This PR addresses the issue by ensuring that PLDTILECFGV is sunk closer to the first instruction that uses a tile after the call.
1 parent 899ee37 commit 0c6f983

File tree

3 files changed

+164
-4
lines changed

3 files changed

+164
-4
lines changed

llvm/lib/Target/X86/X86FastPreTileConfig.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -564,8 +564,17 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
564564
MachineBasicBlock::iterator I;
565565
if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
566566
I = ++LastShapeMI->getIterator();
567-
else
568-
I = ++MI.getIterator();
567+
else {
568+
// Call can overwrite registers like rax, ensure the tile config
569+
// instruction is sinked closer to first instruction that uses tile.
570+
auto UseIt = MI.getIterator();
571+
while (UseIt != MBB.end()) {
572+
if (HasTileOperand(MRI, *UseIt))
573+
break;
574+
++UseIt;
575+
}
576+
I = UseIt;
577+
}
569578
Config(*I);
570579
HasUnconfigTile = false;
571580
continue;

llvm/test/CodeGen/X86/AMX/amx-across-func.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,12 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
139139
; O0-NEXT: callq foo
140140
; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
141141
; O0-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
142+
; O0-NEXT: movl $32, %esi
143+
; O0-NEXT: movl $buf+2048, %edx
142144
; O0-NEXT: # implicit-def: $al
143145
; O0-NEXT: movb %al, {{[0-9]+}}(%rsp)
144146
; O0-NEXT: movw %cx, {{[0-9]+}}(%rsp)
145147
; O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
146-
; O0-NEXT: movl $32, %esi
147-
; O0-NEXT: movl $buf+2048, %edx
148148
; O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
149149
; O0-NEXT: movl $64, %esi
150150
; O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=x86_64-- -mattr=+amx-int8,avx512f -run-pass="fastpretileconfig,regallocfast,fasttileconfig" -o - %s | FileCheck %s
3+
4+
# Test to verify that ldtilecfg instructions are sunk closer to tile-defining
5+
# instructions after a call. This ensures call does not overwrite values in
6+
# registers being used for configuring the AMX tile.
7+
8+
...
9+
---
10+
name: test_api
11+
alignment: 16
12+
tracksRegLiveness: true
13+
registers:
14+
- { id: 0, class: gr64, preferred-register: '', flags: [ ] }
15+
- { id: 1, class: gr64, preferred-register: '', flags: [ ] }
16+
- { id: 2, class: gr64, preferred-register: '', flags: [ ] }
17+
- { id: 3, class: gr64, preferred-register: '', flags: [ ] }
18+
- { id: 4, class: tile, preferred-register: '', flags: [ ] }
19+
- { id: 5, class: gr64_nosp, preferred-register: '', flags: [ ] }
20+
- { id: 6, class: gr64, preferred-register: '', flags: [ ] }
21+
- { id: 9, class: gr64_nosp, preferred-register: '', flags: [ ] }
22+
- { id: 10, class: gr64, preferred-register: '', flags: [ ] }
23+
- { id: 13, class: tile, preferred-register: '', flags: [ ] }
24+
- { id: 14, class: gr64_nosp, preferred-register: '', flags: [ ] }
25+
- { id: 15, class: gr64, preferred-register: '', flags: [ ] }
26+
- { id: 18, class: gr64, preferred-register: '', flags: [ ] }
27+
- { id: 19, class: gr64_nosp, preferred-register: '', flags: [ ] }
28+
- { id: 22, class: gr64, preferred-register: '', flags: [ ] }
29+
- { id: 23, class: gr64, preferred-register: '', flags: [ ] }
30+
- { id: 24, class: gr64, preferred-register: '', flags: [ ] }
31+
- { id: 25, class: tile, preferred-register: '', flags: [ ] }
32+
- { id: 26, class: gr64_nosp, preferred-register: '', flags: [ ] }
33+
- { id: 29, class: gr64_nosp, preferred-register: '', flags: [ ] }
34+
- { id: 30, class: gr64, preferred-register: '', flags: [ ] }
35+
- { id: 33, class: tile, preferred-register: '', flags: [ ] }
36+
- { id: 34, class: gr64_nosp, preferred-register: '', flags: [ ] }
37+
- { id: 35, class: gr64, preferred-register: '', flags: [ ] }
38+
- { id: 38, class: gr64_nosp, preferred-register: '', flags: [ ] }
39+
- { id: 39, class: gr64, preferred-register: '', flags: [ ] }
40+
- { id: 40, class: gr16, preferred-register: '', flags: [ ] }
41+
- { id: 41, class: gr16, preferred-register: '', flags: [ ] }
42+
liveins:
43+
- { reg: '$rdi', virtual-reg: '%0' }
44+
- { reg: '$rsi', virtual-reg: '%2' }
45+
frameInfo:
46+
maxAlignment: 1024
47+
stack:
48+
- { id: 0, size: 1024, alignment: 1024 }
49+
- { id: 1, size: 1024, alignment: 1024 }
50+
- { id: 2, size: 32, alignment: 32 }
51+
- { id: 3, size: 32, alignment: 32 }
52+
- { id: 4, size: 8, alignment: 8 }
53+
machineFunctionInfo:
54+
amxProgModel: ManagedRA
55+
body: |
56+
bb.0.entry:
57+
liveins: $rdi, $rsi
58+
59+
; CHECK-LABEL: name: test_api
60+
; CHECK: liveins: $rdi, $rsi
61+
; CHECK-NEXT: {{ $}}
62+
; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0
63+
; CHECK-NEXT: VMOVUPSZmr %stack.5, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.5, align 4)
64+
; CHECK-NEXT: MOV8mi %stack.5, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.5, align 4)
65+
; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, $rsi :: (store (s64) into %stack.8)
66+
; CHECK-NEXT: renamable $rsi = MOV32ri64 16
67+
; CHECK-NEXT: renamable $rdx = LEA64r %stack.2, 1, $noreg, 0, $noreg
68+
; CHECK-NEXT: renamable $cx = MOV16ri 16
69+
; CHECK-NEXT: MOV16mr %stack.7, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.7)
70+
; CHECK-NEXT: renamable $ax = MOV16ri 2
71+
; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $ax :: (store (s16) into %stack.6)
72+
; CHECK-NEXT: $al = IMPLICIT_DEF
73+
; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
74+
; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
75+
; CHECK-NEXT: $al = IMPLICIT_DEF
76+
; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
77+
; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
78+
; CHECK-NEXT: PLDTILECFGV %stack.5, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.5, align 4)
79+
; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
80+
; CHECK-NEXT: renamable $rsi = MOV32ri64 64
81+
; CHECK-NEXT: renamable $rdx = LEA64r %stack.1, 1, $noreg, 0, $noreg
82+
; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
83+
; CHECK-NEXT: renamable $rsi = MOV32ri64 64
84+
; CHECK-NEXT: renamable $rdx = LEA64r %stack.1, 1, $noreg, 0, $noreg
85+
; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
86+
; CHECK-NEXT: renamable $rdx = MOV32ri64 16
87+
; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, killed renamable $rdi, 1, killed renamable $rdx, 0, $noreg, killed renamable $tmm0
88+
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def $rsp, implicit-def dead $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
89+
; CHECK-NEXT: CALL64pcrel32 &foo, csr_64, implicit $rsp, implicit $ssp, implicit-def $rax
90+
; CHECK-NEXT: $rsi = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8)
91+
; CHECK-NEXT: $cx = MOV16rm %stack.7, 1, $noreg, 0, $noreg :: (load (s16) from %stack.7)
92+
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def $rsp, implicit-def dead $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
93+
; CHECK-NEXT: renamable $rdx = COPY $rax
94+
; CHECK-NEXT: $ax = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
95+
; CHECK-NEXT: MOV64mr killed renamable $rsi, 1, $noreg, 0, $noreg, killed renamable $rdx
96+
; CHECK-NEXT: renamable $rdx = MOV64rm %stack.4, 1, $noreg, 0, $noreg
97+
; CHECK-NEXT: renamable $rsi = MOV32ri64 16
98+
; CHECK-NEXT: $al = IMPLICIT_DEF
99+
; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
100+
; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
101+
; CHECK-NEXT: $al = IMPLICIT_DEF
102+
; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
103+
; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
104+
; CHECK-NEXT: PLDTILECFGV %stack.5, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.5, align 4)
105+
; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
106+
; CHECK-NEXT: renamable $rsi = MOV32ri64 64
107+
; CHECK-NEXT: renamable $rdx = LEA64r %stack.0, 1, $noreg, 0, $noreg
108+
; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
109+
; CHECK-NEXT: renamable $rsi = MOV32ri64 64
110+
; CHECK-NEXT: renamable $rdx = LEA64r %stack.0, 1, $noreg, 0, $noreg
111+
; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
112+
; CHECK-NEXT: renamable $rsi = MOV32ri64 16
113+
; CHECK-NEXT: renamable $rdx = LEA64r %stack.4, 1, $noreg, 0, $noreg
114+
; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
115+
; CHECK-NEXT: RET64
116+
%2:gr64 = COPY $rsi
117+
%0:gr64 = COPY $rdi
118+
%1:gr64 = COPY killed %0
119+
%3:gr64 = COPY killed %2
120+
%38:gr64_nosp = MOV32ri64 16
121+
%39:gr64 = LEA64r %stack.2, 1, $noreg, 0, $noreg
122+
%40:gr16 = MOV16ri 16
123+
%41:gr16 = MOV16ri 2
124+
%33:tile = PTILELOADDV %41:gr16, %40:gr16, killed %39, 1, killed %38, 0, $noreg
125+
%34:gr64_nosp = MOV32ri64 64
126+
%35:gr64 = LEA64r %stack.1, 1, $noreg, 0, $noreg
127+
PTILESTOREDV %41:gr16, %40:gr16, killed %35, 1, killed %34, 0, $noreg, %33
128+
%29:gr64_nosp = MOV32ri64 64
129+
%30:gr64 = LEA64r %stack.1, 1, $noreg, 0, $noreg
130+
%25:tile = PTILELOADDV %41:gr16, %40:gr16, killed %30, 1, killed %29, 0, $noreg
131+
%26:gr64_nosp = MOV32ri64 16
132+
PTILESTOREDV %41:gr16, %40:gr16, %1, 1, killed %26, 0, $noreg, %25
133+
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def $rsp, implicit-def $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
134+
CALL64pcrel32 &foo, csr_64, implicit $rsp, implicit $ssp, implicit-def $rax
135+
ADJCALLSTACKUP64 0, 0, implicit-def $rsp, implicit-def $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
136+
%24:gr64 = COPY $rax
137+
MOV64mr %3, 1, $noreg, 0, $noreg, %24
138+
%22:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg
139+
%19:gr64_nosp = MOV32ri64 16
140+
%13:tile = PTILELOADDV %41:gr16, %40:gr16, %22, 1, killed %19, 0, $noreg
141+
%14:gr64_nosp = MOV32ri64 64
142+
%15:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg
143+
PTILESTOREDV %41:gr16, %40:gr16, killed %15, 1, killed %14, 0, $noreg, %13
144+
%9:gr64_nosp = MOV32ri64 64
145+
%10:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg
146+
%4:tile = PTILELOADDV %41:gr16, %40:gr16, killed %10, 1, killed %9, 0, $noreg
147+
%5:gr64_nosp = MOV32ri64 16
148+
%6:gr64 = LEA64r %stack.4, 1, $noreg, 0, $noreg
149+
PTILESTOREDV %41:gr16, %40:gr16, killed %6, 1, killed %5, 0, $noreg, %4
150+
RET64
151+
...

0 commit comments

Comments
 (0)