Commit fa4f595

Add option to prevent insns straddling half cache-line boundaries
Signed-off-by: John Lu <[email protected]>
1 parent 9b5959d commit fa4f595

19 files changed: +11195 -12 lines changed
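The option guards against instructions whose encoded bytes cross a 32-byte boundary, i.e. half of the 64-byte AMDGPU cache line. A minimal standalone sketch of that condition (illustrative only, not code from this commit; the function name and sample offsets are made up):

#include <cstdio>

// Illustrative only: an instruction "straddles" a half cache-line when its
// first and last byte fall in different 32-byte blocks (cache line = 64 B).
static bool straddlesHalfCacheLine(unsigned Offset, unsigned SizeInBytes) {
  constexpr unsigned HalfCacheLine = 32;
  return Offset / HalfCacheLine != (Offset + SizeInBytes - 1) / HalfCacheLine;
}

int main() {
  // An 8-byte instruction starting at byte 28 spans bytes 28..35 and crosses
  // byte 32; starting at byte 24 it ends at byte 31 and does not.
  std::printf("%d %d\n", straddlesHalfCacheLine(28, 8),
              straddlesHalfCacheLine(24, 8));
  return 0;
}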

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 31 additions & 2 deletions
@@ -159,10 +159,17 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
 }
 
 void AMDGPUAsmPrinter::emitFunctionBodyStart() {
-  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
   const Function &F = MF->getFunction();
 
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+  if (MAI->hasFunctionAlignment()) {
+    Align Alignment = MF->getAlignment();
+    MFI.Alignment = Alignment.value();
+    MFI.Offset = 0;
+  }
+
   // TODO: We're checking this late, would be nice to check it earlier.
   if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
     reportFatalUsageError(
@@ -298,6 +305,18 @@ void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
     HexLines.emplace_back("");
   }
   AsmPrinter::emitBasicBlockStart(MBB);
+
+  Align Alignment = MBB.getAlignment();
+  if (Alignment != Align(1)) {
+    const MachineFunction *MF = MBB.getParent();
+    SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
+        MF->getInfo<SIMachineFunctionInfo>());
+    unsigned BlockAlignment = Alignment.value();
+    // Do not decrease known Alignment. Increment Offset to satisfy
+    // BlockAlignment.
+    MFI->Alignment = std::max(MFI->Alignment, BlockAlignment);
+    MFI->Offset += (BlockAlignment - (MFI->Offset % BlockAlignment));
+  }
 }
 
 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
@@ -640,6 +659,12 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
   return KernelDescriptor;
 }
 
+cl::opt<bool> PreventHalfCacheLineStraddling(
+    "amdgpu-prevent-half-cache-line-straddling", cl::Hidden,
+    cl::desc(
+        "Add NOPs to prevent instructions from straddling half a cache-line"),
+    cl::init(false));
+
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   // Init target streamer lazily on the first function so that previous passes
   // can set metadata.
@@ -654,7 +679,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
   // The starting address of all shader programs must be 256 bytes aligned.
   // Regular functions just need the basic required instruction alignment.
-  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
+  // However, align regular functions to half a cache-line (64/2 = 32 bytes)
+  // if PreventHalfCacheLineStraddling is enabled.
+  MF.setAlignment(MFI->isEntryFunction()           ? Align(256)
+                  : PreventHalfCacheLineStraddling ? Align(32)
+                                                   : Align(4));
 
   SetupMachineFunction(MF);
 
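Taken together, these hunks maintain a running (Alignment, Offset) estimate per function: emitFunctionBodyStart resets Offset to 0 at the function's known alignment, and emitBasicBlockStart advances Offset whenever a basic block carries its own alignment. A small standalone sketch of that bookkeeping (the struct and method names below are illustrative, not LLVM APIs):

#include <algorithm>
#include <cassert>

// Illustrative mirror of the Alignment/Offset bookkeeping above. Note that
// the rounding formula always advances Offset, even when it is already a
// multiple of BlockAlignment.
struct RunningLayout {
  unsigned Alignment = 1; // known alignment of the current position
  unsigned Offset = 0;    // bytes emitted since the last known-aligned point

  void enterFunction(unsigned FuncAlignment) {
    Alignment = FuncAlignment;
    Offset = 0;
  }

  void enterBlock(unsigned BlockAlignment) {
    Alignment = std::max(Alignment, BlockAlignment);
    Offset += BlockAlignment - (Offset % BlockAlignment);
  }
};

int main() {
  RunningLayout L;
  L.enterFunction(32); // regular function aligned to 32 when the flag is on
  L.Offset = 20;       // pretend 20 bytes of instructions were emitted
  L.enterBlock(16);    // rounds 20 up to 32
  assert(L.Offset == 32);
  L.enterBlock(16);    // already a multiple of 16: still advances, 32 -> 48
  assert(L.Offset == 48);
  return 0;
}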
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 65 additions & 10 deletions
@@ -274,12 +274,72 @@ static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
   OS.emitRawComment(" transferring at most " + TransferredRegs);
 }
 
+extern cl::opt<bool> PreventHalfCacheLineStraddling;
+
+static unsigned getMCInstSizeInBytes(const MCInst &LoweredMCI,
+                                     const GCNSubtarget &STI,
+                                     MCContext &OutContext) {
+  SmallVector<MCFixup, 4> Fixups;
+  SmallVector<char, 16> CodeBytes;
+
+  std::unique_ptr<MCCodeEmitter> InstEmitter(
+      createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
+  InstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
+  return CodeBytes.size();
+}
+
 void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
   // FIXME: Enable feature predicate checks once all the test pass.
   // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
   //                                        getSubtargetInfo().getFeatureBits());
 
+  auto AvoidHalfCacheLineBoundary = [this](const MachineInstr *MI,
+                                           const MachineFunction *MF,
+                                           const MCInst &LoweredMCI) -> void {
+    const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
+    SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
+        MF->getInfo<SIMachineFunctionInfo>());
+
+    unsigned InstSizeInBytes = STI.getInstrInfo()->getInstSizeInBytes(*MI);
+
+    // getInstSizeInBytes conservatively overestimates the size of branches due
+    // to a NOP added for the 0x3f offset bug. Any inaccuracies in instruction
+    // sizes will cause problems when avoiding straddling half cache-line
+    // boundaries. A NOP is usually not added, so remove the +4 that was added.
+    if (MI->isBranch() && STI.hasOffset3fBug())
+      InstSizeInBytes -= 4;
+    // Rarely, some MachineInstrs do not have accurate instruction sizes. Try
+    // to calculate the size from the lowered MCInst.
+    else if (InstSizeInBytes == 0 && STI.isCPUStringValid(STI.getCPU()) &&
+             !(MI->getOpcode() == AMDGPU::SI_ILLEGAL_COPY ||
+               MI->getOpcode() == AMDGPU::ATOMIC_FENCE))
+      InstSizeInBytes = getMCInstSizeInBytes(LoweredMCI, STI, OutContext);
+
+    // FIXME: Workaround bug in V_MADMK_F32 size.
+    if (MI->getOpcode() == AMDGPU::V_MADMK_F32)
+      InstSizeInBytes = 8;
+
+    unsigned Alignment = MFI->Alignment;
+    unsigned Offset = MFI->Offset;
+    constexpr unsigned HalfCacheLineBoundary = 32;
+
+    unsigned Boundary = std::min(Alignment, HalfCacheLineBoundary);
+    Offset %= Boundary;
+
+    if (Offset + InstSizeInBytes > Boundary) {
+      emitAlignment(Align(HalfCacheLineBoundary));
+      // Do not decrease known Alignment. Increment Offset to satisfy
+      // HalfCacheLineBoundary.
+      MFI->Alignment = std::max(Alignment, HalfCacheLineBoundary);
+      MFI->Offset +=
+          (HalfCacheLineBoundary - (MFI->Offset % HalfCacheLineBoundary));
+    }
+    MFI->Offset += InstSizeInBytes;
+  };
+
   if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+    if (PreventHalfCacheLineStraddling)
+      AvoidHalfCacheLineBoundary(MI, MF, OutInst);
     EmitToStreamer(*OutStreamer, OutInst);
     return;
   }
@@ -370,6 +430,8 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
   MCInst TmpInst;
   MCInstLowering.lower(MI, TmpInst);
+  if (PreventHalfCacheLineStraddling)
+    AvoidHalfCacheLineBoundary(MI, MF, TmpInst);
   EmitToStreamer(*OutStreamer, TmpInst);
 
 #ifdef EXPENSIVE_CHECKS
@@ -382,16 +444,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
   //
   // We also overestimate branch sizes with the offset bug.
   if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
-      (!STI.hasOffset3fBug() || !MI->isBranch())) {
-    SmallVector<MCFixup, 4> Fixups;
-    SmallVector<char, 16> CodeBytes;
-
-    std::unique_ptr<MCCodeEmitter> InstEmitter(createAMDGPUMCCodeEmitter(
-        *STI.getInstrInfo(), OutContext));
-    InstEmitter->encodeInstruction(TmpInst, CodeBytes, Fixups, STI);
-
-    assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
-  }
+      (!STI.hasOffset3fBug() || !MI->isBranch()))
+    assert(getMCInstSizeInBytes(TmpInst, STI, OutContext) ==
+           STI.getInstrInfo()->getInstSizeInBytes(*MI));
 #endif
 
   if (DumpCodeInstEmitter) {
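The core decision in AvoidHalfCacheLineBoundary is the Boundary/Offset arithmetic: reduce the running offset modulo min(Alignment, 32) and emit alignment padding when the next instruction would spill past that boundary. A standalone sketch of just that predicate (the function name and sample values are illustrative, not from the patch):

#include <algorithm>
#include <cstdio>

// Illustrative predicate mirroring the check above: would an instruction of
// InstSizeInBytes, emitted at the current (Alignment, Offset), cross a
// 32-byte half cache-line boundary?
static bool needsPaddingBefore(unsigned Alignment, unsigned Offset,
                               unsigned InstSizeInBytes) {
  constexpr unsigned HalfCacheLineBoundary = 32;
  unsigned Boundary = std::min(Alignment, HalfCacheLineBoundary);
  Offset %= Boundary;
  return Offset + InstSizeInBytes > Boundary;
}

int main() {
  // With a 32-byte-aligned function: an 8-byte instruction at offset 28 would
  // spill past byte 32 (padding needed), while at offset 24 it ends exactly
  // on the boundary (no padding).
  std::printf("%d %d\n", needsPaddingBefore(32, 28, 8),
              needsPaddingBefore(32, 24, 8));
  return 0;
}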

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 6 additions & 0 deletions
@@ -525,6 +525,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;
 
 public:
+  // Current known instruction alignment and offset in bytes.
+  // Used to prevent instructions from straddling half cache-line boundaries
+  // for performance.
+  unsigned Alignment = 1;
+  unsigned Offset = 0;
+
   struct VGPRSpillToAGPR {
     SmallVector<MCPhysReg, 32> Lanes;
     bool FullyAllocated = false;
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import re
+import sys
+
+if len(sys.argv) != 2:
+    print("Usage: has_straddle.py <disassembly file>")
+    sys.exit(1)
+
+inputFilename = sys.argv[1]
+address_and_encoding_regex = r"// (\S{12}):(( [0-9A-F]{8})+)"
+
+file = open(inputFilename)
+
+for line in file:
+    match = re.search(address_and_encoding_regex, line)
+    if match:
+        hexaddress = match.group(1)
+        encoding = match.group(2)
+        dwords = encoding.split()
+        address = int(hexaddress, 16)
+        address_end = address + len(dwords) * 4 - 1
+        # Cache-line is 64 bytes. Check for half cache-line straddle.
+        if address // 32 != address_end // 32:
+            print("Straddling instruction found at:")
+            print(line)
+            sys.exit(1)
+
+sys.exit(0)
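The script keys off disassembly comment lines of the form "// <12-character address>: <one or more 8-hex-digit dwords>" and flags any instruction whose first and last byte land in different 32-byte blocks. The same check rendered in C++ on a single line of input (the instruction, address, and encoding below are made up purely to exercise the regex, not taken from a real disassembly):

#include <cstdio>
#include <regex>
#include <sstream>
#include <string>

int main() {
  // Hypothetical disassembly line in the format the script's regex expects.
  const std::string Line = "s_nop 0 // 00000000003C: BF800000 BF800000";
  const std::regex Re("// (\\S{12}):(( [0-9A-F]{8})+)");
  std::smatch M;
  if (std::regex_search(Line, M, Re)) {
    unsigned long long Addr = std::stoull(M[1].str(), nullptr, 16);
    // Each 8-hex-digit group is one 4-byte dword of encoding.
    std::istringstream Enc(M[2].str());
    std::string Dword;
    unsigned NumDwords = 0;
    while (Enc >> Dword)
      ++NumDwords;
    unsigned long long End = Addr + 4ull * NumDwords - 1;
    // Half cache-line is 32 bytes (cache line is 64).
    bool Straddles = (Addr / 32) != (End / 32);
    std::printf("addr=0x%llX size=%u straddles=%d\n", Addr, NumDwords * 4,
                Straddles);
  }
  return 0;
}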
Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=fiji -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=fiji -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load <2 x i32>, ptr addrspace(1) %in0
+  %b = load <2 x i32>, ptr addrspace(1) %in1
+  %result = xor <2 x i32> %a, %b
+  store <2 x i32> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load <4 x i32>, ptr addrspace(1) %in0
+  %b = load <4 x i32>, ptr addrspace(1) %in1
+  %result = xor <4 x i32> %a, %b
+  store <4 x i32> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load float, ptr addrspace(1) %in0
+  %b = load float, ptr addrspace(1) %in1
+  %acmp = fcmp oge float %a, 0.000000e+00
+  %bcmp = fcmp oge float %b, 1.000000e+00
+  %xor = xor i1 %acmp, %bcmp
+  %result = select i1 %xor, float %a, float %b
+  store float %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load volatile i1, ptr addrspace(1) %in0
+  %b = load volatile i1, ptr addrspace(1) %in1
+  %xor = xor i1 %a, %b
+  store i1 %xor, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i32, ptr addrspace(1) %in0
+  %b = load i32, ptr addrspace(1) %in1
+  %result = xor i32 %a, %b
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+  %result = xor i32 %a, %b
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
+  %result = xor i32 %a, -1
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i32, ptr addrspace(1) %in0
+  %b = load i32, ptr addrspace(1) %in1
+  %result = xor i32 %a, -1
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i64, ptr addrspace(1) %in0
+  %b = load i64, ptr addrspace(1) %in1
+  %result = xor i64 %a, %b
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
+  %result = xor i64 %a, %b
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
+  %result = xor i64 %a, -1
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i64, ptr addrspace(1) %in0
+  %b = load i64, ptr addrspace(1) %in1
+  %result = xor i64 %a, -1
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) {
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = xor i64 %a, %b
+  br label %endif
+
+else:
+  %2 = load i64, ptr addrspace(1) %in
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+  %or = xor i64 %a, 4261135838621753
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) {
+  %or = xor i64 %a, 4261135838621753
+  store i64 %or, ptr addrspace(1) %out
+
+  %foo = add i64 %b, 4261135838621753
+  store volatile i64 %foo, ptr addrspace(1) poison
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+  %or = xor i64 %a, 63
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+  %or = xor i64 %a, -8
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 8
+  %or = xor i64 %loada, -8
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 8
+  %or = xor i64 %loada, 22470723082367
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
