Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
4bbf382
[llvm][RISCV] Implement Zilsd load/store pair optimization
4vtomat Sep 10, 2025
5722750
fixup! fix comments, move post-ra zilsd to RISCVLoadStoreOptimizer
4vtomat Sep 25, 2025
569a86c
Merge remote-tracking branch 'origin/main' into zilsd-optimizer
4vtomat Sep 25, 2025
ce8fc11
fixup! rename simm12 to simm12_lo
4vtomat Sep 25, 2025
2ff058f
fixup! negative test case, handle symbols
4vtomat Oct 7, 2025
a4cca81
Merge remote-tracking branch 'origin/main' into zilsd-optimizer
4vtomat Oct 8, 2025
294d83e
fixup! unused function
4vtomat Oct 13, 2025
14c0463
fixup! group memop kind, alignment, remove external symbol
4vtomat Oct 22, 2025
57c10b5
fixup! change MCPhysReg to MCRegister
4vtomat Oct 27, 2025
7c9b744
fixup! minor change
4vtomat Oct 30, 2025
90f3aa4
Merge remote-tracking branch 'origin/main' into zilsd-optimizer
4vtomat Nov 10, 2025
ed57a2c
fixup! clang format
4vtomat Nov 10, 2025
fff9d4b
fixup! fix prera alignment and simplify the code
4vtomat Nov 14, 2025
483104e
fixup! fix postra and simplify the code
4vtomat Nov 14, 2025
66d95f1
fixup! PseudoLD_RV32_OPT early clobber
4vtomat Nov 14, 2025
c4c4925
fixup! hint test
4vtomat Nov 14, 2025
7c377e3
fixup! dead/kill flag, alignment, remove early-clobber, required prop…
4vtomat Nov 20, 2025
fe67c22
fixup! clang-format
4vtomat Nov 20, 2025
696bad8
fixup! update test
4vtomat Nov 20, 2025
d79e21c
fixup! update test comments
4vtomat Nov 20, 2025
2a87065
fixup! update test comment
4vtomat Nov 20, 2025
44f5809
fixup! skip preserved register
4vtomat Nov 21, 2025
9453b78
fixup! add TODO
4vtomat Nov 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llvm/lib/Target/RISCV/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ add_llvm_target(RISCVCodeGen
RISCVVLOptimizer.cpp
RISCVVMV0Elimination.cpp
RISCVZacasABIFix.cpp
RISCVZilsdOptimizer.cpp
GISel/RISCVCallLowering.cpp
GISel/RISCVInstructionSelector.cpp
GISel/RISCVLegalizerInfo.cpp
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/RISCV/RISCV.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ void initializeRISCVPushPopOptPass(PassRegistry &);
FunctionPass *createRISCVLoadStoreOptPass();
void initializeRISCVLoadStoreOptPass(PassRegistry &);

FunctionPass *createRISCVPreAllocZilsdOptPass();
void initializeRISCVPreAllocZilsdOptPass(PassRegistry &);

FunctionPass *createRISCVZacasABIFixPass();
void initializeRISCVZacasABIFixPass(PassRegistry &);

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/RISCV/RISCVFeatures.td
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,10 @@ def HasStdExtZilsd : Predicate<"Subtarget->hasStdExtZilsd()">,
AssemblerPredicate<(all_of FeatureStdExtZilsd),
"'Zilsd' (Load/Store pair instructions)">;

def FeatureZilsd4ByteAlign
: SubtargetFeature<"zilsd-4byte-align", "AllowZilsd4ByteAlign", "true",
"Allow 4-byte alignment for Zilsd LD/SD instructions">;

// Multiply Extensions

def FeatureStdExtZmmul
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfoZilsd.td
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,23 @@ let Predicates = [HasStdExtZilsd, IsRV32] in {
def PseudoLD_RV32 : PseudoLoad<"ld", GPRPairRV32>;
def PseudoSD_RV32 : PseudoStore<"sd", GPRPairRV32>;

// Pseudo instructions for load/store optimization with 2 separate registers
def PseudoLD_RV32_OPT :
Pseudo<(outs GPR:$rd1, GPR:$rd2),
(ins GPR:$rs1, simm12_lo:$imm12), [], "", ""> {
let hasSideEffects = 0;
let mayLoad = 1;
let mayStore = 0;
}

def PseudoSD_RV32_OPT :
Pseudo<(outs),
(ins GPR:$rs1, GPR:$rs2, GPR:$rs3, simm12_lo:$imm12), [], "", ""> {
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 1;
}

def : InstAlias<"ld $rd, (${rs1})", (LD_RV32 GPRPairRV32:$rd, GPR:$rs1, 0), 0>;
def : InstAlias<"sd $rs2, (${rs1})", (SD_RV32 GPRPairRV32:$rs2, GPR:$rs1, 0), 0>;
}
263 changes: 252 additions & 11 deletions llvm/lib/Target/RISCV/RISCVLoadStoreOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
// paired instruction, leveraging hardware support for paired memory accesses.
// Much of the pairing logic is adapted from the AArch64LoadStoreOpt pass.
//
// Post-allocation Zilsd decomposition: Fixes invalid LD/SD instructions if
// register allocation didn't provide suitable consecutive registers.
//
// NOTE: The AArch64LoadStoreOpt pass performs additional optimizations such as
// merging zero store instructions, promoting loads that read directly from a
// preceding store, and merging base register updates with load/store
Expand All @@ -23,6 +26,7 @@

#include "RISCV.h"
#include "RISCVTargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/MC/TargetRegistry.h"
Expand All @@ -38,6 +42,8 @@ using namespace llvm;
// pairs.
static cl::opt<unsigned> LdStLimit("riscv-load-store-scan-limit", cl::init(128),
cl::Hidden);
STATISTIC(NumLD2LW, "Number of LD instructions split back to LW");
STATISTIC(NumSD2SW, "Number of SD instructions split back to SW");

namespace {

Expand Down Expand Up @@ -75,6 +81,13 @@ struct RISCVLoadStoreOpt : public MachineFunctionPass {
mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired, bool MergeForward);

// Post reg-alloc zilsd part
bool fixInvalidRegPairOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
bool isConsecutiveRegPair(Register First, Register Second);
void splitLdSdIntoTwo(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI, bool IsLoad);

private:
AliasAnalysis *AA;
MachineRegisterInfo *MRI;
Expand All @@ -91,9 +104,8 @@ INITIALIZE_PASS(RISCVLoadStoreOpt, DEBUG_TYPE, RISCV_LOAD_STORE_OPT_NAME, false,
bool RISCVLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(Fn.getFunction()))
return false;

const RISCVSubtarget &Subtarget = Fn.getSubtarget<RISCVSubtarget>();
if (!Subtarget.useMIPSLoadStorePairs())
return false;

bool MadeChange = false;
TII = Subtarget.getInstrInfo();
Expand All @@ -103,18 +115,34 @@ bool RISCVLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
ModifiedRegUnits.init(*TRI);
UsedRegUnits.init(*TRI);

for (MachineBasicBlock &MBB : Fn) {
LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
if (Subtarget.useMIPSLoadStorePairs()) {
for (MachineBasicBlock &MBB : Fn) {
LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");

for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
if (TII->isPairableLdStInstOpc(MBBI->getOpcode()) &&
tryToPairLdStInst(MBBI))
MadeChange = true;
else
++MBBI;
}
}
}

for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
if (TII->isPairableLdStInstOpc(MBBI->getOpcode()) &&
tryToPairLdStInst(MBBI))
MadeChange = true;
else
++MBBI;
if (!Subtarget.is64Bit() && Subtarget.hasStdExtZilsd()) {
for (auto &MBB : Fn) {
for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) {
if (fixInvalidRegPairOp(MBB, MBBI)) {
MadeChange = true;
// Iterator was updated by fixInvalidRegPairOp
} else {
++MBBI;
}
}
}
}

return MadeChange;
}

Expand Down Expand Up @@ -395,6 +423,219 @@ RISCVLoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
return NextI;
}

//===----------------------------------------------------------------------===//
// Post reg-alloc zilsd pass implementation
//===----------------------------------------------------------------------===//

bool RISCVLoadStoreOpt::isConsecutiveRegPair(Register First, Register Second) {
// Special case: First register can not be zero
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this comment matches the code.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed!

// zeros
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Special case: First register can not be zero
// zeros
// Special case: First register can not be zero

if (First == RISCV::X0)
return true;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we need to check that Second is also X0?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I think we should

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code here isn't great, but this does now do the correct thing.

  if (First == X0)
    return (Second == x0);

Would be equivalent to these two initial ifs and simpler.


// Check if registers form a valid even/odd pair for Zilsd
unsigned FirstNum = TRI->getEncodingValue(First);
unsigned SecondNum = TRI->getEncodingValue(Second);

// Must be consecutive and first must be even
return (FirstNum % 2 == 0) && (SecondNum == FirstNum + 1);
}

void RISCVLoadStoreOpt::splitLdSdIntoTwo(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
bool IsLoad) {
MachineInstr *MI = &*MBBI;
DebugLoc DL = MI->getDebugLoc();

Register FirstReg = MI->getOperand(0).getReg();
Register SecondReg = MI->getOperand(1).getReg();
Register BaseReg = MI->getOperand(2).getReg();

// Handle both immediate and symbolic operands for offset
const MachineOperand &OffsetOp = MI->getOperand(3);
int BaseOffset;
if (OffsetOp.isImm())
BaseOffset = OffsetOp.getImm();
else
// For symbolic operands, extract the embedded offset
BaseOffset = OffsetOp.getOffset();

unsigned Opc = IsLoad ? RISCV::LW : RISCV::SW;

// Create two separate instructions
if (IsLoad) {
auto MIB1 = BuildMI(MBB, MBBI, DL, TII->get(Opc))
.addReg(FirstReg, RegState::Define)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible for BaseReg to be the same as FirstReg? If that happens the next instruction will get the wrong value for its base.

Copy link
Collaborator

@topperc topperc Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test case that miscompiles with -mattr=+zilsd,+unaligned-scalar-mem

declare void @bar(i32, i32, i32)                                                 
                                                                                 
define void @foo(ptr %x) {                                                       
  %a = load i32, ptr %x, align 4                                                 
  %b = getelementptr i32, ptr %x, i32 1                                          
  %c = load i32, ptr %b, align 4                                                 
  call void @bar(i32 %a, i32 1, i32 %c)                                          
  ret void                                                                       
}

It produces

        lw      a0, 0(a0)
        lw      a2, 4(a0)

The second load is using the result of the first as a pointer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, it's possible to be same, I just use early clobber to resolve this but not sure it's too aggressive?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the other choice is to just emit the two loads in the opposite order. i.e. for the case above,

  lw a2, 4(a0)
  lw a0, 2(a0)

Would be acceptable, and would prevent needing the earlyclobber. This could be a follow-up change though.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can pass FirstReg as a 5th argument to BuildMI instead of explicitly using .addReg(FirstReg, RegState::Define)

.addReg(BaseReg);

auto MIB2 = BuildMI(MBB, MBBI, DL, TII->get(Opc))
.addReg(SecondReg, RegState::Define)
.addReg(BaseReg);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should preserve the Kill flag for base register.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that right!


// Add offset operands - preserve symbolic references
if (OffsetOp.isImm()) {
MIB1.addImm(BaseOffset);
MIB2.addImm(BaseOffset + 4);
} else if (OffsetOp.isGlobal()) {
MIB1.addGlobalAddress(OffsetOp.getGlobal(), BaseOffset,
OffsetOp.getTargetFlags());
MIB2.addGlobalAddress(OffsetOp.getGlobal(), BaseOffset + 4,
OffsetOp.getTargetFlags());
} else if (OffsetOp.isCPI()) {
MIB1.addConstantPoolIndex(OffsetOp.getIndex(), BaseOffset,
OffsetOp.getTargetFlags());
MIB2.addConstantPoolIndex(OffsetOp.getIndex(), BaseOffset + 4,
OffsetOp.getTargetFlags());
} else if (OffsetOp.isSymbol()) {
MIB1.addExternalSymbol(OffsetOp.getSymbolName(),
OffsetOp.getTargetFlags());
MIB2.addExternalSymbol(OffsetOp.getSymbolName(),
OffsetOp.getTargetFlags());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we can get here, as the offset of an external symbol is always 0.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@4vtomat this was not addressed. And if it can happen, it would be incorrect. As now we would have 2 4-byte loads/stores accessing the same memory. We would need an offset for the second access to maintain the behavior of the paired instruction.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry about that, I just forgot to push some fixes and I thought I did..
I think it can be removed lol

} else if (OffsetOp.isBlockAddress()) {
MIB1.addBlockAddress(OffsetOp.getBlockAddress(), BaseOffset,
OffsetOp.getTargetFlags());
MIB2.addBlockAddress(OffsetOp.getBlockAddress(), BaseOffset + 4,
OffsetOp.getTargetFlags());
}

// Copy memory operands if the original instruction had them
// FIXME: This is overly conservative; the new instruction accesses 4 bytes,
// not 8.
if (MI->memoperands_begin() != MI->memoperands_end()) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use memoperands_empty()? Though I'm not sure why we need this check at all. Doesn't cloneMemRefs work even if there are no memory operands?

MIB1.cloneMemRefs(*MI);
MIB2.cloneMemRefs(*MI);
}

++NumLD2LW;
LLVM_DEBUG(dbgs() << "Split LD back to two LW instructions\n");
} else {
auto MIB1 =
BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(FirstReg).addReg(BaseReg);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should preserve the kill flags for the registers. Need to be careful if FirstReg and SecondReg are the same register.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we already ensure FirstReg and SecondReg is not same register in pre-ra pass, I think there's no something like CSE happened between pre-ra pass and post-ra pass?


auto MIB2 =
BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(SecondReg).addReg(BaseReg);

// Add offset operands - preserve symbolic references
if (OffsetOp.isImm()) {
MIB1.addImm(BaseOffset);
MIB2.addImm(BaseOffset + 4);
} else if (OffsetOp.isGlobal()) {
MIB1.addGlobalAddress(OffsetOp.getGlobal(), BaseOffset,
OffsetOp.getTargetFlags());
MIB2.addGlobalAddress(OffsetOp.getGlobal(), BaseOffset + 4,
OffsetOp.getTargetFlags());
} else if (OffsetOp.isCPI()) {
MIB1.addConstantPoolIndex(OffsetOp.getIndex(), BaseOffset,
OffsetOp.getTargetFlags());
MIB2.addConstantPoolIndex(OffsetOp.getIndex(), BaseOffset + 4,
OffsetOp.getTargetFlags());
} else if (OffsetOp.isSymbol()) {
MIB1.addExternalSymbol(OffsetOp.getSymbolName(),
OffsetOp.getTargetFlags());
MIB2.addExternalSymbol(OffsetOp.getSymbolName(),
OffsetOp.getTargetFlags());
} else if (OffsetOp.isBlockAddress()) {
MIB1.addBlockAddress(OffsetOp.getBlockAddress(), BaseOffset,
OffsetOp.getTargetFlags());
MIB2.addBlockAddress(OffsetOp.getBlockAddress(), BaseOffset + 4,
OffsetOp.getTargetFlags());
}

// Copy memory operands if the original instruction had them
// FIXME: This is overly conservative; the new instruction accesses 4 bytes,
// not 8.
if (MI->memoperands_begin() != MI->memoperands_end()) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use memoperands_empty()? Though I'm not sure why we need this check at all. Doesn't cloneMemRefs work even if there are no memory operands?

MIB1.cloneMemRefs(*MI);
MIB2.cloneMemRefs(*MI);
}

++NumSD2SW;
LLVM_DEBUG(dbgs() << "Split SD back to two SW instructions\n");
}

// Remove the original paired instruction and update iterator
MBBI = MBB.erase(MBBI);
}

bool RISCVLoadStoreOpt::fixInvalidRegPairOp(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) {
MachineInstr *MI = &*MBBI;
unsigned Opcode = MI->getOpcode();

// Check if this is a Zilsd pseudo that needs fixing
if (Opcode != RISCV::PseudoLD_RV32_OPT && Opcode != RISCV::PseudoSD_RV32_OPT)
return false;

bool IsLoad = (Opcode == RISCV::PseudoLD_RV32_OPT);

Register FirstReg = MI->getOperand(0).getReg();
Register SecondReg = MI->getOperand(1).getReg();

// Check if we have valid consecutive registers
if (!isConsecutiveRegPair(FirstReg, SecondReg)) {
// Need to split back into two instructions
splitLdSdIntoTwo(MBB, MBBI, IsLoad);
return true;
}

// Registers are valid, convert to real LD/SD instruction
Register BaseReg = MI->getOperand(2).getReg();
DebugLoc DL = MI->getDebugLoc();
// Handle both immediate and symbolic operands for offset
const MachineOperand &OffsetOp = MI->getOperand(3);
int BaseOffset;
if (OffsetOp.isImm())
BaseOffset = OffsetOp.getImm();
else
// For symbolic operands, extract the embedded offset
BaseOffset = OffsetOp.getOffset();

unsigned RealOpc = IsLoad ? RISCV::LD_RV32 : RISCV::SD_RV32;

// Create register pair from the two individual registers
unsigned RegPair = TRI->getMatchingSuperReg(FirstReg, RISCV::sub_gpr_even,
&RISCV::GPRPairRegClass);
// Create the real LD/SD instruction with register pair
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(RealOpc));

if (IsLoad) {
// For LD, the register pair is the destination
MIB.addReg(RegPair, RegState::Define);
} else {
// For SD, the register pair is the source
MIB.addReg(RegPair);
}

MIB.addReg(BaseReg);

// Add offset operand - preserve symbolic references
if (OffsetOp.isImm())
MIB.addImm(BaseOffset);
else if (OffsetOp.isGlobal())
MIB.addGlobalAddress(OffsetOp.getGlobal(), BaseOffset,
OffsetOp.getTargetFlags());
else if (OffsetOp.isCPI())
MIB.addConstantPoolIndex(OffsetOp.getIndex(), BaseOffset,
OffsetOp.getTargetFlags());
else if (OffsetOp.isSymbol())
MIB.addExternalSymbol(OffsetOp.getSymbolName(), OffsetOp.getTargetFlags());
else if (OffsetOp.isBlockAddress())
MIB.addBlockAddress(OffsetOp.getBlockAddress(), BaseOffset,
OffsetOp.getTargetFlags());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you bothering? Just use MIB.add(OffsetOp) I think.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@4vtomat was this addressed?

Copy link
Member Author

@4vtomat 4vtomat Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

addressed!


// Copy memory operands if the original instruction had them
if (MI->memoperands_begin() != MI->memoperands_end())
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use memoperands_empty()? Though I'm not sure why we need this check at all. Doesn't cloneMemRefs work even if there are no memory operands?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can just remove the check lol

MIB.cloneMemRefs(*MI);

LLVM_DEBUG(dbgs() << "Converted pseudo to real instruction: " << *MIB
<< "\n");

// Remove the pseudo instruction and update iterator
MBBI = MBB.erase(MBBI);

return true;
}

// Returns an instance of the Load / Store Optimization pass.
FunctionPass *llvm::createRISCVLoadStoreOptPass() {
return new RISCVLoadStoreOpt();
Expand Down
Loading