33 changes: 31 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -159,10 +159,17 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
}

void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
const Function &F = MF->getFunction();

const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
if (MAI->hasFunctionAlignment()) {
Align Alignment = MF->getAlignment();
MFI.Alignment = Alignment.value();
MFI.Offset = 0;
}
Comment on lines +166 to +171
Contributor: The alignment should be taken directly from the function without interpretation through the MCAsmInfo.
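A minimal sketch of what the reviewer appears to be asking for, assuming the patch keeps its MFI.Alignment and MFI.Offset fields: read the alignment straight off the MachineFunction and drop the MCAsmInfo query.

// Sketch only, not the committed change; MFI.Alignment and MFI.Offset are the
// fields this patch introduces and may themselves move per the later review.
MFI.Alignment = MF->getAlignment().value();
MFI.Offset = 0;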

// TODO: We're checking this late, would be nice to check it earlier.
if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
reportFatalUsageError(
@@ -298,6 +305,18 @@ void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
HexLines.emplace_back("");
}
AsmPrinter::emitBasicBlockStart(MBB);

Align Alignment = MBB.getAlignment();
if (Alignment != Align(1)) {
const MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
MF->getInfo<SIMachineFunctionInfo>());
unsigned BlockAlignment = Alignment.value();
// Do not decrease known Alignment. Round Offset up to satisfy
// BlockAlignment.
MFI->Alignment = std::max(MFI->Alignment, BlockAlignment);
MFI->Offset = alignTo(MFI->Offset, Alignment);
}
}
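For reference, the rounding applied to the tracked offset above is the usual round-up-to-a-multiple computation; the snippet below is a self-contained illustration (not part of the patch) of what alignTo does.

// Illustration only: mirrors alignTo's behavior for the tracked offset.
constexpr unsigned roundUpTo(unsigned Value, unsigned Alignment) {
  return (Value + Alignment - 1) / Alignment * Alignment;
}
static_assert(roundUpTo(33, 32) == 64, "rounds up to the next multiple");
static_assert(roundUpTo(64, 32) == 64, "already-aligned values are unchanged");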

void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
@@ -640,6 +659,12 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
return KernelDescriptor;
}

cl::opt<bool> PreventHalfCacheLineStraddling(
"amdgpu-prevent-half-cache-line-straddling", cl::Hidden,
cl::desc(
"Add NOPs to prevent instructions from straddling half a cache-line"),
cl::init(false));

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Init target streamer lazily on the first function so that previous passes
// can set metadata.
@@ -654,7 +679,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {

// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
// However, align regular functions to half a cache line (32 bytes) if
// PreventHalfCacheLineStraddling is enabled.
MF.setAlignment(MFI->isEntryFunction() ? Align(256)
: PreventHalfCacheLineStraddling ? Align(32)
: Align(4));

SetupMachineFunction(MF);

75 changes: 65 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -274,12 +274,72 @@ static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
OS.emitRawComment(" transferring at most " + TransferredRegs);
}

extern cl::opt<bool> PreventHalfCacheLineStraddling;

Contributor: We should not be creating a temporary MCCodeEmitter to get the size of a single instruction; this was previously under EXPENSIVE_CHECKS for a reason.

static unsigned getMCInstSizeInBytes(const MCInst &LoweredMCI,
const GCNSubtarget &STI,
MCContext &OutContext) {
SmallVector<MCFixup, 4> Fixups;
SmallVector<char, 16> CodeBytes;

std::unique_ptr<MCCodeEmitter> InstEmitter(
createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
InstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
return CodeBytes.size();
}
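The reviewer's objection above is to constructing a throwaway MCCodeEmitter for every instruction. A minimal sketch of one way to address it: build the encoder once and reuse it, with the cache threaded through by reference (the CachedInstEmitter name is hypothetical, not from the patch).

// Sketch only: encode with a reusable emitter instead of a temporary one.
static unsigned
getEncodedSizeInBytes(const MCInst &LoweredMCI, const GCNSubtarget &STI,
                      MCContext &OutContext,
                      std::unique_ptr<MCCodeEmitter> &CachedInstEmitter) {
  if (!CachedInstEmitter)
    CachedInstEmitter.reset(
        createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
  SmallVector<MCFixup, 4> Fixups;
  SmallVector<char, 16> CodeBytes;
  CachedInstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
  return CodeBytes.size();
}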

void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
// FIXME: Enable feature predicate checks once all the test pass.
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
// getSubtargetInfo().getFeatureBits());

auto AvoidHalfCacheLineBoundary = [this](const MachineInstr *MI,
const MachineFunction *MF,
const MCInst &LoweredMCI) -> void {
const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
MF->getInfo<SIMachineFunctionInfo>());
Comment on lines +300 to +301
Contributor: no const_cast
unsigned InstSizeInBytes = STI.getInstrInfo()->getInstSizeInBytes(*MI);
Contributor: MC should never use getInstSizeInBytes
// getInstSizeInBytes conservatively overestimates the size of branches due
// to a NOP added for the 0x3f offset bug. Any inaccuracies in instruction
// sizes will cause problems when avoiding straddling half cache-line
// boundaries. A NOP is usually not added, so remove the +4 that was added.
if (MI->isBranch() && STI.hasOffset3fBug())
InstSizeInBytes -= 4;
// Rarely, some MachineInstrs do not have accurate instruction sizes. Try to
// calculate the size from the lowered MCInst.
else if (InstSizeInBytes == 0 && STI.isCPUStringValid(STI.getCPU()) &&
!(MI->getOpcode() == AMDGPU::SI_ILLEGAL_COPY ||
MI->getOpcode() == AMDGPU::ATOMIC_FENCE))
Comment on lines +313 to +315
Contributor: Shouldn't require special casing
InstSizeInBytes = getMCInstSizeInBytes(LoweredMCI, STI, OutContext);

// FIXME: Workaround bug in V_MADMK_F32 size.
if (MI->getOpcode() == AMDGPU::V_MADMK_F32)
InstSizeInBytes = 8;
Comment on lines +318 to +320
Contributor: Directly fix this
unsigned Alignment = MFI->Alignment;
unsigned Offset = MFI->Offset;
constexpr unsigned HalfCacheLineBoundary = 32;

unsigned Boundary = std::min(Alignment, HalfCacheLineBoundary);
Offset %= Boundary;

if (Offset + InstSizeInBytes > Boundary) {
emitAlignment(Align(HalfCacheLineBoundary));
// Do not decrease known Alignment. Round Offset up to the next
// HalfCacheLineBoundary.
MFI->Alignment = std::max(Alignment, HalfCacheLineBoundary);
MFI->Offset = alignTo(MFI->Offset, Align(HalfCacheLineBoundary));
}
MFI->Offset += InstSizeInBytes;
};
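To make the boundary check above concrete: an 8-byte instruction sitting 28 bytes past a 32-byte-aligned point would straddle, since 28 + 8 > 32, so the lambda pads to the next 32-byte boundary first. A simplified standalone form of the condition (not from the patch, and assuming the known alignment is at least 32 bytes):

// Illustration only; the in-tree lambda also tracks the strongest known
// alignment and clamps the boundary it checks against.
static bool wouldStraddleHalfCacheLine(unsigned OffsetFromAlignedPoint,
                                       unsigned InstSizeInBytes) {
  constexpr unsigned HalfCacheLineBoundary = 32;
  unsigned Offset = OffsetFromAlignedPoint % HalfCacheLineBoundary;
  return Offset + InstSizeInBytes > HalfCacheLineBoundary;
}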

if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
if (PreventHalfCacheLineStraddling)
AvoidHalfCacheLineBoundary(MI, MF, OutInst);
EmitToStreamer(*OutStreamer, OutInst);
return;
}
@@ -370,6 +430,8 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {

MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
if (PreventHalfCacheLineStraddling)
AvoidHalfCacheLineBoundary(MI, MF, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);

#ifdef EXPENSIVE_CHECKS
@@ -382,16 +444,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
//
// We also overestimate branch sizes with the offset bug.
if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
(!STI.hasOffset3fBug() || !MI->isBranch())) {
SmallVector<MCFixup, 4> Fixups;
SmallVector<char, 16> CodeBytes;

std::unique_ptr<MCCodeEmitter> InstEmitter(createAMDGPUMCCodeEmitter(
*STI.getInstrInfo(), OutContext));
InstEmitter->encodeInstruction(TmpInst, CodeBytes, Fixups, STI);

assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
}
(!STI.hasOffset3fBug() || !MI->isBranch()))
assert(getMCInstSizeInBytes(TmpInst, STI, OutContext) ==
STI.getInstrInfo()->getInstSizeInBytes(*MI));
#endif

if (DumpCodeInstEmitter) {
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -525,6 +525,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;

public:
// Current known instruction alignment and offset in bytes.
// Used to prevent instructions from straddling half cache-line boundaries
// for performance.
unsigned Alignment = 1;
unsigned Offset = 0;
Comment on lines +528 to +532
Contributor: This doesn't belong in MachineFunctionInfo. No per-pass state should ever be here.
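One way to act on this comment, sketched under the assumption that the tracking can live in the printer that consumes it; the struct and field names below are hypothetical, not part of the patch. Keeping the state in AMDGPUAsmPrinter, reset in runOnMachineFunction, would also remove the const_casts flagged earlier.

// Hypothetical emission-time layout state owned by the AsmPrinter rather
// than by SIMachineFunctionInfo.
struct EmissionLayoutState {
  unsigned Alignment = 1; // strongest alignment known at the emission point
  unsigned Offset = 0;    // bytes emitted since that alignment was established
};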
struct VGPRSpillToAGPR {
SmallVector<MCPhysReg, 32> Lanes;
bool FullyAllocated = false;
29 changes: 29 additions & 0 deletions llvm/test/CodeGen/AMDGPU/has_cache_straddle.py
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
Contributor: This is unusual and we probably shouldn't need python in these tests. If it's going to exist it needs to be in an Inputs subdirectory.
import re
import sys

if len(sys.argv) != 2:
    print("Usage: has_cache_straddle.py <disassembly file>")
    sys.exit(1)

inputFilename = sys.argv[1]
address_and_encoding_regex = r"// (\S{12}):(( [0-9A-F]{8})+)"

with open(inputFilename) as disassembly:
    for line in disassembly:
        match = re.search(address_and_encoding_regex, line)
        if match:
            hexaddress = match.group(1)
            encoding = match.group(2)
            dwords = encoding.split()
            address = int(hexaddress, 16)
            address_end = address + len(dwords) * 4 - 1
            # Cache-line is 64 bytes. Check for a half cache-line straddle.
            if address // 32 != address_end // 32:
                print("Straddling instruction found at:")
                print(line)
                sys.exit(1)

sys.exit(0)
153 changes: 153 additions & 0 deletions llvm/test/CodeGen/AMDGPU/no_straddle.ll
@@ -0,0 +1,153 @@
;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=fiji -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=fiji -d - > %t.dis
;RUN: %python %p/has_cache_straddle.py %t.dis

define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load <2 x i32>, ptr addrspace(1) %in0
%b = load <2 x i32>, ptr addrspace(1) %in1
%result = xor <2 x i32> %a, %b
store <2 x i32> %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load <4 x i32>, ptr addrspace(1) %in0
%b = load <4 x i32>, ptr addrspace(1) %in1
%result = xor <4 x i32> %a, %b
store <4 x i32> %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load float, ptr addrspace(1) %in0
%b = load float, ptr addrspace(1) %in1
%acmp = fcmp oge float %a, 0.000000e+00
%bcmp = fcmp oge float %b, 1.000000e+00
%xor = xor i1 %acmp, %bcmp
%result = select i1 %xor, float %a, float %b
store float %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load volatile i1, ptr addrspace(1) %in0
%b = load volatile i1, ptr addrspace(1) %in1
%xor = xor i1 %a, %b
store i1 %xor, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, %b
store i32 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
%result = xor i32 %a, %b
store i32 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
%result = xor i32 %a, -1
store i32 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, -1
store i32 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, %b
store i64 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
%result = xor i64 %a, %b
store i64 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
%result = xor i64 %a, -1
store i64 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, -1
store i64 %result, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) {
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else

if:
%1 = xor i64 %a, %b
br label %endif

else:
%2 = load i64, ptr addrspace(1) %in
br label %endif

endif:
%3 = phi i64 [%1, %if], [%2, %else]
store i64 %3, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) {
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out

%foo = add i64 %b, 4261135838621753
store volatile i64 %foo, ptr addrspace(1) poison
ret void
}

define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
%or = xor i64 %a, 63
store i64 %or, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
%or = xor i64 %a, -8
store i64 %or, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, -8
store i64 %or, ptr addrspace(1) %out
ret void
}

define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, 22470723082367
store i64 %or, ptr addrspace(1) %out
ret void
}