[AMDGPU] Add option to prevent insns straddling half cache-line boundaries #150239
@@ -274,12 +274,72 @@ static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
  OS.emitRawComment(" transferring at most " + TransferredRegs);
}

extern cl::opt<bool> PreventHalfCacheLineStraddling;

static unsigned getMCInstSizeInBytes(const MCInst &LoweredMCI,
Review comment: We should not be creating a temporary MCCodeEmitter to get the size of a single instruction; this was previously under EXPENSIVE_CHECKS for a reason.
                                     const GCNSubtarget &STI,
                                     MCContext &OutContext) {
  SmallVector<MCFixup, 4> Fixups;
  SmallVector<char, 16> CodeBytes;

  std::unique_ptr<MCCodeEmitter> InstEmitter(
      createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
  InstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
  return (CodeBytes.size());
};
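One way to address the review comment above (a sketch only, not part of this patch; `CachedSizeEmitter` is an invented member name) would be to construct the code emitter once per AsmPrinter and reuse it for every size query:

```cpp
// Sketch: reuse a single MCCodeEmitter owned by the printer instead of
// constructing a temporary one for every instruction.
// 'CachedSizeEmitter' is a hypothetical std::unique_ptr<MCCodeEmitter>
// member of AMDGPUAsmPrinter.
unsigned AMDGPUAsmPrinter::getMCInstSizeInBytes(const MCInst &LoweredMCI,
                                                const GCNSubtarget &STI) {
  if (!CachedSizeEmitter)
    CachedSizeEmitter.reset(
        createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
  SmallVector<MCFixup, 4> Fixups;
  SmallVector<char, 16> CodeBytes;
  CachedSizeEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
  return CodeBytes.size();
}
```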
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
  // FIXME: Enable feature predicate checks once all the test pass.
  // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
  //                                        getSubtargetInfo().getFeatureBits());

  auto AvoidHalfCacheLineBoundary = [this](const MachineInstr *MI,
                                           const MachineFunction *MF,
                                           const MCInst &LoweredMCI) -> void {
    const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
    SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
        MF->getInfo<SIMachineFunctionInfo>());
Review comment on lines +300 to +301: no const_cast.

    unsigned InstSizeInBytes = STI.getInstrInfo()->getInstSizeInBytes(*MI);
Review comment: MC should never use getInstSizeInBytes.
    // getInstrSizeInBytes conservatively overestimates the size of branches due
    // to a NOP added for the 0x3f offset bug. Any inaccuracies in instruction
    // sizes will cause problems when avoiding straddling half cache-line
    // boundaries. A NOP is usually not added so remove the +4 that was added.
    if (MI->isBranch() && STI.hasOffset3fBug())
      InstSizeInBytes -= 4;
    // Rarely, some MachineInstr do not have accurate instruction sizes. Try to
    // calculate the size from the lowered MCInst.
    else if (InstSizeInBytes == 0 && STI.isCPUStringValid(STI.getCPU()) &&
             !(MI->getOpcode() == AMDGPU::SI_ILLEGAL_COPY ||
               MI->getOpcode() == AMDGPU::ATOMIC_FENCE))
Review comment on lines +313 to +315: Shouldn't require special casing.
      InstSizeInBytes = getMCInstSizeInBytes(LoweredMCI, STI, OutContext);

    // FIXME: Workaround bug in V_MADMK_F32 size.
    if (MI->getOpcode() == AMDGPU::V_MADMK_F32)
      InstSizeInBytes = 8;
Review comment on lines +318 to +320: Directly fix this.
    unsigned Alignment = MFI->Alignment;
    unsigned Offset = MFI->Offset;
    constexpr unsigned HalfCacheLineBoundary = 32;

    unsigned Boundary = std::min(Alignment, HalfCacheLineBoundary);
    Offset %= Boundary;

    if (Offset + InstSizeInBytes > Boundary) {
      emitAlignment(Align(HalfCacheLineBoundary));
      // Do not decrease known Alignment. Increment Offset to satisfy
      // HalfCacheLineBoundary.
      MFI->Alignment = std::max(Alignment, HalfCacheLineBoundary);
      MFI->Offset +=
          (HalfCacheLineBoundary - (MFI->Offset % HalfCacheLineBoundary));
    }
    MFI->Offset += InstSizeInBytes;
  };
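The arithmetic in this lambda can be checked in isolation. The following standalone sketch (illustrative names, not part of the patch) mirrors the logic and walks through one example:

```cpp
// Worked example of the half cache-line straddle check above.
#include <algorithm>
#include <cassert>

struct TrackedPosition {
  unsigned Alignment = 1; // strongest alignment known at the current point
  unsigned Offset = 0;    // bytes emitted since that alignment
};

// Returns the padding emitted before an InstSize-byte instruction so that it
// does not cross a 32-byte (half cache-line) boundary.
unsigned paddingToAvoidStraddle(TrackedPosition &P, unsigned InstSize) {
  constexpr unsigned HalfCacheLine = 32;
  unsigned Boundary = std::min(P.Alignment, HalfCacheLine);
  unsigned Pad = 0;
  if (P.Offset % Boundary + InstSize > Boundary) {
    Pad = HalfCacheLine - (P.Offset % HalfCacheLine);
    P.Alignment = std::max(P.Alignment, HalfCacheLine);
    P.Offset += Pad;
  }
  P.Offset += InstSize;
  return Pad;
}

int main() {
  // 28 bytes already emitted after a 256-byte-aligned label; an 8-byte
  // instruction would occupy [28, 36) and straddle byte 32, so 4 bytes of
  // padding are inserted and the instruction lands at [32, 40) instead.
  TrackedPosition P{/*Alignment=*/256, /*Offset=*/28};
  assert(paddingToAvoidStraddle(P, 8) == 4);
  assert(P.Offset == 40);
}
```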

  if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
    if (PreventHalfCacheLineStraddling)
      AvoidHalfCacheLineBoundary(MI, MF, OutInst);
    EmitToStreamer(*OutStreamer, OutInst);
    return;
  }
@@ -370,6 +430,8 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {

  MCInst TmpInst;
  MCInstLowering.lower(MI, TmpInst);
  if (PreventHalfCacheLineStraddling)
    AvoidHalfCacheLineBoundary(MI, MF, TmpInst);
  EmitToStreamer(*OutStreamer, TmpInst);

#ifdef EXPENSIVE_CHECKS
@@ -382,16 +444,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
   //
   // We also overestimate branch sizes with the offset bug.
   if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
-      (!STI.hasOffset3fBug() || !MI->isBranch())) {
-    SmallVector<MCFixup, 4> Fixups;
-    SmallVector<char, 16> CodeBytes;
-
-    std::unique_ptr<MCCodeEmitter> InstEmitter(createAMDGPUMCCodeEmitter(
-        *STI.getInstrInfo(), OutContext));
-    InstEmitter->encodeInstruction(TmpInst, CodeBytes, Fixups, STI);
-
-    assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
-  }
+      (!STI.hasOffset3fBug() || !MI->isBranch()))
+    assert(getMCInstSizeInBytes(TmpInst, STI, OutContext) ==
+           STI.getInstrInfo()->getInstSizeInBytes(*MI));
 #endif

   if (DumpCodeInstEmitter) {
@@ -525,6 +525,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
  void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;

public:
  // Current known instruction alignment and offset in bytes.
  // Used to prevent instructions from straddling half cache-line boundaries
  // for performance.
  unsigned Alignment = 1;
  unsigned Offset = 0;
Review comment on lines +528 to +532: This doesn't belong in MachineFunctionInfo. No per-pass state should ever be here.
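A minimal sketch of the alternative the reviewer points at (member names invented here, not taken from the patch): keep the running alignment/offset in the printer itself and reset it per function, instead of storing it in SIMachineFunctionInfo:

```cpp
// Hypothetical per-printer emission state (sketch only, not from this PR).
class AMDGPUAsmPrinter final : public AsmPrinter {
  // ...
  unsigned KnownInstAlignment = 1; // strongest alignment known so far
  unsigned KnownInstOffset = 0;    // bytes emitted since that alignment

  void resetStraddleTracking(const MachineFunction &MF) {
    KnownInstAlignment = static_cast<unsigned>(MF.getAlignment().value());
    KnownInstOffset = 0;
  }
  // ...
};
```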

  struct VGPRSpillToAGPR {
    SmallVector<MCPhysReg, 32> Lanes;
    bool FullyAllocated = false;
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

Review comment: This is unusual and we probably shouldn't need python in these tests. If it's going to exist it needs to be in an Inputs subdirectory.

import re
import sys

if len(sys.argv) != 2:
    print("Usage: has_straddle.py <disassembly file>")
    sys.exit(1)

inputFilename = sys.argv[1]
address_and_encoding_regex = r"// (\S{12}):(( [0-9A-F]{8})+)"

file = open(inputFilename)

for line in file:
    match = re.search(address_and_encoding_regex, line)
    if match:
        hexaddress = match.group(1)
        encoding = match.group(2)
        dwords = encoding.split()
        address = int(hexaddress, 16)
        address_end = address + len(dwords) * 4 - 1
        # Cache-line is 64 bytes. Check for half cache-line straddle.
        if address // 32 != address_end // 32:
            print("Straddling instruction found at:")
            print(line)
            sys.exit(1)

sys.exit(0)
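For reference, the regex above matches disassembly lines that carry an address-and-encoding comment of the form `// 000000000120: D2850002 00000203` (the values here are illustrative); each 8-hex-digit dword counts as 4 bytes of instruction encoding.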
@@ -0,0 +1,153 @@
;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=fiji -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=fiji -d - > %t.dis
;RUN: %python %p/has_cache_straddle.py %t.dis

define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
  %a = load <2 x i32>, ptr addrspace(1) %in0
  %b = load <2 x i32>, ptr addrspace(1) %in1
  %result = xor <2 x i32> %a, %b
  store <2 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
  %a = load <4 x i32>, ptr addrspace(1) %in0
  %b = load <4 x i32>, ptr addrspace(1) %in1
  %result = xor <4 x i32> %a, %b
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
  %a = load float, ptr addrspace(1) %in0
  %b = load float, ptr addrspace(1) %in1
  %acmp = fcmp oge float %a, 0.000000e+00
  %bcmp = fcmp oge float %b, 1.000000e+00
  %xor = xor i1 %acmp, %bcmp
  %result = select i1 %xor, float %a, float %b
  store float %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
  %a = load volatile i1, ptr addrspace(1) %in0
  %b = load volatile i1, ptr addrspace(1) %in1
  %xor = xor i1 %a, %b
  store i1 %xor, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
  %a = load i32, ptr addrspace(1) %in0
  %b = load i32, ptr addrspace(1) %in1
  %result = xor i32 %a, %b
  store i32 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
  %result = xor i32 %a, %b
  store i32 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
  %result = xor i32 %a, -1
  store i32 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
  %a = load i32, ptr addrspace(1) %in0
  %b = load i32, ptr addrspace(1) %in1
  %result = xor i32 %a, -1
  store i32 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
  %a = load i64, ptr addrspace(1) %in0
  %b = load i64, ptr addrspace(1) %in1
  %result = xor i64 %a, %b
  store i64 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
  %result = xor i64 %a, %b
  store i64 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
  %result = xor i64 %a, -1
  store i64 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
  %a = load i64, ptr addrspace(1) %in0
  %b = load i64, ptr addrspace(1) %in1
  %result = xor i64 %a, -1
  store i64 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) {
entry:
  %0 = icmp eq i64 %a, 0
  br i1 %0, label %if, label %else

if:
  %1 = xor i64 %a, %b
  br label %endif

else:
  %2 = load i64, ptr addrspace(1) %in
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
  %or = xor i64 %a, 4261135838621753
  store i64 %or, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) {
  %or = xor i64 %a, 4261135838621753
  store i64 %or, ptr addrspace(1) %out

  %foo = add i64 %b, 4261135838621753
  store volatile i64 %foo, ptr addrspace(1) poison
  ret void
}

define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
  %or = xor i64 %a, 63
  store i64 %or, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
  %or = xor i64 %a, -8
  store i64 %or, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
  %loada = load i64, ptr addrspace(1) %a, align 8
  %or = xor i64 %loada, -8
  store i64 %or, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
  %loada = load i64, ptr addrspace(1) %a, align 8
  %or = xor i64 %loada, 22470723082367
  store i64 %or, ptr addrspace(1) %out
  ret void
}
Review comment: The alignment should be taken directly from the function without interpretation through the MCAsmInfo.
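A sketch of what this suggests (an assumption, not code from the patch): seed the tracked alignment straight from the MachineFunction, e.g.:

```cpp
// Sketch: take the starting alignment from the function itself rather than
// deriving it through MCAsmInfo. 'MF' is the AsmPrinter's current function.
const Align FnAlign = MF->getAlignment();
```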