33 changes: 31 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -159,10 +159,17 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
}

void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
const Function &F = MF->getFunction();

const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
if (MAI->hasFunctionAlignment()) {
Align Alignment = MF->getAlignment();
MFI.Alignment = Alignment.value();
MFI.Offset = 0;
}
Comment on lines +166 to +171
Contributor: The alignment should be taken directly from the function without interpretation through the MCAsmInfo.
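A minimal sketch of what the reviewer appears to be asking for, assuming the patch keeps its MFI.Alignment and MFI.Offset fields: read the alignment straight off the MachineFunction and drop the MCAsmInfo query.

// Sketch only, not the committed change; MFI.Alignment and MFI.Offset are the
// fields this patch introduces and may themselves move per the later review.
MFI.Alignment = MF->getAlignment().value();
MFI.Offset = 0;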

// TODO: We're checking this late, would be nice to check it earlier.
if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
reportFatalUsageError(
@@ -298,6 +305,18 @@ void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
HexLines.emplace_back("");
}
AsmPrinter::emitBasicBlockStart(MBB);

Align Alignment = MBB.getAlignment();
if (Alignment != Align(1)) {
const MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
MF->getInfo<SIMachineFunctionInfo>());
unsigned BlockAlignment = Alignment.value();
// Do not decrease known Alignment. Round Offset up to satisfy
// BlockAlignment.
MFI->Alignment = std::max(MFI->Alignment, BlockAlignment);
MFI->Offset = alignTo(MFI->Offset, Alignment);
}
}
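For reference, the rounding applied to the tracked offset above is the usual round-up-to-a-multiple computation; the snippet below is a self-contained illustration (not part of the patch) of what alignTo does.

// Illustration only: mirrors alignTo's behavior for the tracked offset.
constexpr unsigned roundUpTo(unsigned Value, unsigned Alignment) {
  return (Value + Alignment - 1) / Alignment * Alignment;
}
static_assert(roundUpTo(33, 32) == 64, "rounds up to the next multiple");
static_assert(roundUpTo(64, 32) == 64, "already-aligned values are unchanged");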

void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
@@ -640,6 +659,12 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
return KernelDescriptor;
}

cl::opt<bool> PreventHalfCacheLineStraddling(
"amdgpu-prevent-half-cache-line-straddling", cl::Hidden,
cl::desc(
"Add NOPs to prevent instructions from straddling half a cache-line"),
cl::init(false));

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Init target streamer lazily on the first function so that previous passes
// can set metadata.
@@ -654,7 +679,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {

// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
// However, align regular functions to half a cache line (32 bytes) if
// PreventHalfCacheLineStraddling is enabled.
MF.setAlignment(MFI->isEntryFunction() ? Align(256)
: PreventHalfCacheLineStraddling ? Align(32)
: Align(4));

SetupMachineFunction(MF);

75 changes: 65 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -274,12 +274,72 @@ static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
OS.emitRawComment(" transferring at most " + TransferredRegs);
}

extern cl::opt<bool> PreventHalfCacheLineStraddling;

Contributor: We should not be creating a temporary MCCodeEmitter to get the size of a single instruction; this was previously under EXPENSIVE_CHECKS for a reason.

static unsigned getMCInstSizeInBytes(const MCInst &LoweredMCI,
const GCNSubtarget &STI,
MCContext &OutContext) {
SmallVector<MCFixup, 4> Fixups;
SmallVector<char, 16> CodeBytes;

std::unique_ptr<MCCodeEmitter> InstEmitter(
createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
InstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
return CodeBytes.size();
}
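The reviewer's objection above is to constructing a throwaway MCCodeEmitter for every instruction. A minimal sketch of one way to address it: build the encoder once and reuse it, with the cache threaded through by reference (the CachedInstEmitter name is hypothetical, not from the patch).

// Sketch only: encode with a reusable emitter instead of a temporary one.
static unsigned
getEncodedSizeInBytes(const MCInst &LoweredMCI, const GCNSubtarget &STI,
                      MCContext &OutContext,
                      std::unique_ptr<MCCodeEmitter> &CachedInstEmitter) {
  if (!CachedInstEmitter)
    CachedInstEmitter.reset(
        createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
  SmallVector<MCFixup, 4> Fixups;
  SmallVector<char, 16> CodeBytes;
  CachedInstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
  return CodeBytes.size();
}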

void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
// FIXME: Enable feature predicate checks once all the test pass.
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
// getSubtargetInfo().getFeatureBits());

auto AvoidHalfCacheLineBoundary = [this](const MachineInstr *MI,
const MachineFunction *MF,
const MCInst &LoweredMCI) -> void {
const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
MF->getInfo<SIMachineFunctionInfo>());
Comment on lines +300 to +301
Contributor: no const_cast
unsigned InstSizeInBytes = STI.getInstrInfo()->getInstSizeInBytes(*MI);
Contributor: MC should never use getInstSizeInBytes
// getInstSizeInBytes conservatively overestimates the size of branches due
// to a NOP added for the 0x3f offset bug. Any inaccuracies in instruction
// sizes will cause problems when avoiding straddling half cache-line
// boundaries. A NOP is usually not added, so remove the +4 that was added.
if (MI->isBranch() && STI.hasOffset3fBug())
InstSizeInBytes -= 4;
// Rarely, some MachineInstrs do not have accurate instruction sizes. Try to
// calculate the size from the lowered MCInst.
else if (InstSizeInBytes == 0 && STI.isCPUStringValid(STI.getCPU()) &&
!(MI->getOpcode() == AMDGPU::SI_ILLEGAL_COPY ||
MI->getOpcode() == AMDGPU::ATOMIC_FENCE))
Comment on lines +313 to +315
Contributor: Shouldn't require special casing
InstSizeInBytes = getMCInstSizeInBytes(LoweredMCI, STI, OutContext);

// FIXME: Workaround bug in V_MADMK_F32 size.
if (MI->getOpcode() == AMDGPU::V_MADMK_F32)
InstSizeInBytes = 8;
Comment on lines +318 to +320
Contributor: Directly fix this
unsigned Alignment = MFI->Alignment;
unsigned Offset = MFI->Offset;
constexpr unsigned HalfCacheLineBoundary = 32;

unsigned Boundary = std::min(Alignment, HalfCacheLineBoundary);
Offset %= Boundary;

if (Offset + InstSizeInBytes > Boundary) {
emitAlignment(Align(HalfCacheLineBoundary));
// Do not decrease known Alignment. Round Offset up to the next
// HalfCacheLineBoundary.
MFI->Alignment = std::max(Alignment, HalfCacheLineBoundary);
MFI->Offset = alignTo(MFI->Offset, Align(HalfCacheLineBoundary));
}
MFI->Offset += InstSizeInBytes;
};
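To make the boundary check above concrete: an 8-byte instruction sitting 28 bytes past a 32-byte-aligned point would straddle, since 28 + 8 > 32, so the lambda pads to the next 32-byte boundary first. A simplified standalone form of the condition (not from the patch, and assuming the known alignment is at least 32 bytes):

// Illustration only; the in-tree lambda also tracks the strongest known
// alignment and clamps the boundary it checks against.
static bool wouldStraddleHalfCacheLine(unsigned OffsetFromAlignedPoint,
                                       unsigned InstSizeInBytes) {
  constexpr unsigned HalfCacheLineBoundary = 32;
  unsigned Offset = OffsetFromAlignedPoint % HalfCacheLineBoundary;
  return Offset + InstSizeInBytes > HalfCacheLineBoundary;
}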

if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
if (PreventHalfCacheLineStraddling)
AvoidHalfCacheLineBoundary(MI, MF, OutInst);
EmitToStreamer(*OutStreamer, OutInst);
return;
}
@@ -370,6 +430,8 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {

MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
if (PreventHalfCacheLineStraddling)
AvoidHalfCacheLineBoundary(MI, MF, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);

#ifdef EXPENSIVE_CHECKS
@@ -382,16 +444,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
//
// We also overestimate branch sizes with the offset bug.
if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
(!STI.hasOffset3fBug() || !MI->isBranch())) {
SmallVector<MCFixup, 4> Fixups;
SmallVector<char, 16> CodeBytes;

std::unique_ptr<MCCodeEmitter> InstEmitter(createAMDGPUMCCodeEmitter(
*STI.getInstrInfo(), OutContext));
InstEmitter->encodeInstruction(TmpInst, CodeBytes, Fixups, STI);

assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
}
(!STI.hasOffset3fBug() || !MI->isBranch()))
assert(getMCInstSizeInBytes(TmpInst, STI, OutContext) ==
STI.getInstrInfo()->getInstSizeInBytes(*MI));
#endif

if (DumpCodeInstEmitter) {
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -525,6 +525,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;

public:
// Current known instruction alignment and offset in bytes.
// Used to prevent instructions from straddling half cache-line boundaries
// for performance.
unsigned Alignment = 1;
unsigned Offset = 0;
Comment on lines +528 to +532
Contributor: This doesn't belong in MachineFunctionInfo. No per-pass state should ever be here.
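One way to act on this comment, sketched under the assumption that the tracking can live in the printer that consumes it; the struct and field names below are hypothetical, not part of the patch. Keeping the state in AMDGPUAsmPrinter, reset in runOnMachineFunction, would also remove the const_casts flagged earlier.

// Hypothetical emission-time layout state owned by the AsmPrinter rather
// than by SIMachineFunctionInfo.
struct EmissionLayoutState {
  unsigned Alignment = 1; // strongest alignment known at the emission point
  unsigned Offset = 0;    // bytes emitted since that alignment was established
};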
struct VGPRSpillToAGPR {
SmallVector<MCPhysReg, 32> Lanes;
bool FullyAllocated = false;
29 changes: 29 additions & 0 deletions llvm/test/CodeGen/AMDGPU/has_cache_straddle.py
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
Contributor: This is unusual and we probably shouldn't need python in these tests. If it's going to exist it needs to be in an Inputs subdirectory.
import re
import sys

if len(sys.argv) != 2:
    print("Usage: has_cache_straddle.py <disassembly file>")
    sys.exit(1)

inputFilename = sys.argv[1]
address_and_encoding_regex = r"// (\S{12}):(( [0-9A-F]{8})+)"

with open(inputFilename) as disassembly:
    for line in disassembly:
        match = re.search(address_and_encoding_regex, line)
        if match:
            hexaddress = match.group(1)
            encoding = match.group(2)
            dwords = encoding.split()
            address = int(hexaddress, 16)
            address_end = address + len(dwords) * 4 - 1
            # Cache-line is 64 bytes. Check for a half cache-line straddle.
            if address // 32 != address_end // 32:
                print("Straddling instruction found at:")
                print(line)
                sys.exit(1)

sys.exit(0)
153 changes: 153 additions & 0 deletions llvm/test/CodeGen/AMDGPU/no_straddle.ll
@@ -0,0 +1,153 @@
;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=fiji -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=fiji -d - > %t.dis
;RUN: %python %p/has_cache_straddle.py %t.dis

define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load <2 x i32>, ptr addrspace(1) %in0
%b = load <2 x i32>, ptr addrspace(1) %in1
%result = xor <2 x i32> %a, %b
store <2 x i32> %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load <4 x i32>, ptr addrspace(1) %in0
%b = load <4 x i32>, ptr addrspace(1) %in1
%result = xor <4 x i32> %a, %b
store <4 x i32> %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load float, ptr addrspace(1) %in0
%b = load float, ptr addrspace(1) %in1
%acmp = fcmp oge float %a, 0.000000e+00
%bcmp = fcmp oge float %b, 1.000000e+00
%xor = xor i1 %acmp, %bcmp
%result = select i1 %xor, float %a, float %b
store float %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load volatile i1, ptr addrspace(1) %in0
%b = load volatile i1, ptr addrspace(1) %in1
%xor = xor i1 %a, %b
store i1 %xor, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, %b
store i32 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
%result = xor i32 %a, %b
store i32 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
%result = xor i32 %a, -1
store i32 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, -1
store i32 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, %b
store i64 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
%result = xor i64 %a, %b
store i64 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
%result = xor i64 %a, -1
store i64 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, -1
store i64 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) {
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else

if:
%1 = xor i64 %a, %b
br label %endif

else:
%2 = load i64, ptr addrspace(1) %in
br label %endif

endif:
%3 = phi i64 [%1, %if], [%2, %else]
store i64 %3, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) {
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out

%foo = add i64 %b, 4261135838621753
store volatile i64 %foo, ptr addrspace(1) poison
ret void
}

define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
%or = xor i64 %a, 63
store i64 %or, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
%or = xor i64 %a, -8
store i64 %or, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, -8
store i64 %or, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, 22470723082367
store i64 %or, ptr addrspace(1) %out
ret void
}