Commit 5e4f842

Fix up parallel moves
1 parent 0dff4ec commit 5e4f842

File tree

3 files changed, +99 -20 lines changed

zjit/src/backend/arm64/mod.rs

Lines changed: 96 additions & 9 deletions
@@ -5,6 +5,7 @@ use crate::asm::arm64::*;
 use crate::codegen::split_patch_point;
 use crate::cruby::*;
 use crate::backend::lir::*;
+use crate::hir;
 use crate::options::asm_dump;
 use crate::stats::CompileError;
 use crate::virtualmem::CodePtr;
@@ -690,11 +691,52 @@ impl Assembler {
         asm_local
     }

+    /// Resolve ParallelMov instructions without using scratch registers.
+    /// This is used for trampolines that don't allow scratch registers.
+    /// Linearizes all blocks into a single giant block.
+    fn resolve_parallel_mov_pass(self) -> Assembler {
+        let mut asm_local = Assembler::new();
+        asm_local.accept_scratch_reg = self.accept_scratch_reg;
+        asm_local.stack_base_idx = self.stack_base_idx;
+        asm_local.label_names = self.label_names.clone();
+        asm_local.live_ranges.resize(self.live_ranges.len(), LiveRange { start: None, end: None });
+
+        // Create one giant block to linearize everything into
+        asm_local.new_block(hir::BlockId(usize::MAX), true, true);
+
+        // Get linearized instructions with branch parameters expanded into ParallelMov
+        let linearized_insns = self.linearize_instructions();
+
+        // Process each linearized instruction
+        for insn in linearized_insns {
+            match insn {
+                Insn::ParallelMov { moves } => {
+                    // Resolve parallel moves without scratch register
+                    if let Some(resolved_moves) = Assembler::resolve_parallel_moves(&moves, None) {
+                        for (dst, src) in resolved_moves {
+                            asm_local.mov(dst, src);
+                        }
+                    } else {
+                        unreachable!("ParallelMov requires scratch register but scratch_reg is not allowed");
+                    }
+                }
+                _ => {
+                    // Strip branch args from jumps since everything is now in one block
+                    let insn = Self::strip_branch_args(&self, insn);
+                    asm_local.push_insn(insn);
+                }
+            }
+        }
+
+        asm_local
+    }
+
     /// Split instructions using scratch registers. To maximize the use of the register pool for
     /// VRegs, most splits should happen in [`Self::arm64_split`]. However, some instructions
     /// need to be split with registers after `alloc_regs`, e.g. for `compile_exits`, so this
     /// splits them and uses scratch registers for it.
-    fn arm64_scratch_split(mut self) -> Assembler {
+    /// Linearizes all blocks into a single giant block.
+    fn arm64_scratch_split(self) -> Assembler {
         /// If opnd is Opnd::Mem with a too large disp, make the disp smaller using lea.
         fn split_large_disp(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd {
             match opnd {
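
Note: resolve_parallel_moves orders a set of simultaneous moves so that no source is clobbered before it is read; a cyclic set of moves has no such ordering without a temporary, which is why the call can fail when given no scratch register. A minimal sketch of the shape that would hit the unreachable!() above (illustrative only: the X0_REG/X1_REG operand names and the exact type of the scratch parameter are assumptions, not taken from this diff):

    // A register swap is the smallest cycle: each destination is the other
    // move's source, so there is no clobber-free emission order without a
    // temporary. Pairs are (dst, src), matching the loop above.
    let moves = vec![
        (Opnd::Reg(X0_REG), Opnd::Reg(X1_REG)), // X0 <- X1
        (Opnd::Reg(X1_REG), Opnd::Reg(X0_REG)), // X1 <- X0
    ];
    // With no scratch register available, the resolver reports failure,
    // which resolve_parallel_mov_pass() treats as a broken invariant.
    assert!(Assembler::resolve_parallel_moves(&moves, None).is_none());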
@@ -750,12 +792,23 @@ impl Assembler {
         // Prepare StackState to lower MemBase::Stack
         let stack_state = StackState::new(self.stack_base_idx);

-        let mut asm_local = Assembler::new_with_asm(&self);
+        let mut asm_local = Assembler::new();
+        asm_local.accept_scratch_reg = true;
+        asm_local.stack_base_idx = self.stack_base_idx;
+        asm_local.label_names = self.label_names.clone();
+        asm_local.live_ranges.resize(self.live_ranges.len(), LiveRange { start: None, end: None });
+
+        // Create one giant block to linearize everything into
+        asm_local.new_block(hir::BlockId(usize::MAX), true, true);
+
         let asm = &mut asm_local;
-        asm.accept_scratch_reg = true;
-        let iterator = &mut self.instruction_iterator();

-        while let Some((_, mut insn)) = iterator.next(asm) {
+        // Get linearized instructions with branch parameters expanded into ParallelMov
+        let linearized_insns = self.linearize_instructions();
+
+        // Process each linearized instruction
+        for (idx, insn) in linearized_insns.iter().enumerate() {
+            let mut insn = insn.clone();
             match &mut insn {
                 Insn::Add { left, right, out } |
                 Insn::Sub { left, right, out } |
@@ -795,7 +848,7 @@ impl Assembler {
                     };

                     // If the next instruction is JoMul
-                    if matches!(iterator.peek(), Some((_, Insn::JoMul(_)))) {
+                    if idx + 1 < linearized_insns.len() && matches!(linearized_insns[idx + 1], Insn::JoMul(_)) {
                         // Produce a register that is all zeros or all ones
                         // Based on the sign bit of the 64-bit mul result
                         asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: reg_out, shift: Opnd::UImm(63) });
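
The iterator.peek() call disappears because the pass now walks a pre-linearized Vec, so lookahead becomes a bounds-checked index. A self-contained illustration of the pattern (toy enum, not the real Insn type):

    // Toy stand-in for "is the next instruction JoMul?" over a slice.
    #[derive(PartialEq)]
    enum Toy { Mul, JoMul, Nop }

    fn next_is_jomul(insns: &[Toy], idx: usize) -> bool {
        // Same shape as the check above: guard the end, then look one ahead.
        idx + 1 < insns.len() && insns[idx + 1] == Toy::JoMul
    }

    fn main() {
        let insns = [Toy::Mul, Toy::JoMul, Toy::Nop];
        assert!(next_is_jomul(&insns, 0));
        assert!(!next_is_jomul(&insns, 1));
    }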
@@ -907,6 +960,8 @@ impl Assembler {
                     split_patch_point(asm, target, invariant, version);
                 }
                 _ => {
+                    // Strip branch args from jumps since everything is now in one block
+                    let insn = Self::strip_branch_args(&self, insn);
                     asm.push_insn(insn);
                 }
             }
@@ -915,6 +970,28 @@ impl Assembler {
         asm_local
     }

+    /// Strip branch arguments from a jump instruction, converting Target::Block(edge) with args
+    /// into Target::Label. This is needed after linearization since all blocks are merged into one.
+    fn strip_branch_args(asm: &Assembler, insn: Insn) -> Insn {
+        match insn {
+            Insn::Jmp(Target::Block(edge)) => Insn::Jmp(Target::Label(asm.block_label(edge.target))),
+            Insn::Jz(Target::Block(edge)) => Insn::Jz(Target::Label(asm.block_label(edge.target))),
+            Insn::Jnz(Target::Block(edge)) => Insn::Jnz(Target::Label(asm.block_label(edge.target))),
+            Insn::Je(Target::Block(edge)) => Insn::Je(Target::Label(asm.block_label(edge.target))),
+            Insn::Jne(Target::Block(edge)) => Insn::Jne(Target::Label(asm.block_label(edge.target))),
+            Insn::Jl(Target::Block(edge)) => Insn::Jl(Target::Label(asm.block_label(edge.target))),
+            Insn::Jg(Target::Block(edge)) => Insn::Jg(Target::Label(asm.block_label(edge.target))),
+            Insn::Jge(Target::Block(edge)) => Insn::Jge(Target::Label(asm.block_label(edge.target))),
+            Insn::Jbe(Target::Block(edge)) => Insn::Jbe(Target::Label(asm.block_label(edge.target))),
+            Insn::Jb(Target::Block(edge)) => Insn::Jb(Target::Label(asm.block_label(edge.target))),
+            Insn::Jo(Target::Block(edge)) => Insn::Jo(Target::Label(asm.block_label(edge.target))),
+            Insn::JoMul(Target::Block(edge)) => Insn::JoMul(Target::Label(asm.block_label(edge.target))),
+            Insn::Joz(opnd, Target::Block(edge)) => Insn::Joz(opnd, Target::Label(asm.block_label(edge.target))),
+            Insn::Jonz(opnd, Target::Block(edge)) => Insn::Jonz(opnd, Target::Label(asm.block_label(edge.target))),
+            _ => insn,
+        }
+    }
+
     /// Emit platform-specific machine code
     /// Returns a list of GC offsets. Can return failure to signal caller to retry.
     fn arm64_emit(&mut self, cb: &mut CodeBlock) -> Option<Vec<CodePtr>> {
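
Dropping the edge's branch arguments is safe at this point because linearize_instructions() has already expanded them into ParallelMov instructions, so the jump itself only needs a plain label. Conceptually (pseudo-LIR, not literal syntax):

    // Before linearization:              After linearization + strip_branch_args:
    //   Jmp Block(bb2, args: [v1, v2])     ParallelMov { moves: [(p0, v1), (p1, v2)] }
    //                                      Jmp Label(bb2_label)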
@@ -1112,8 +1189,11 @@ impl Assembler {
         let (_hook, mut hook_insn_idx) = AssemblerPanicHook::new(self, 0);

         // For each instruction
+        // NOTE: At this point, the assembler should have been linearized into a single giant block
+        // by either resolve_parallel_mov_pass() or arm64_scratch_split().
         let mut insn_idx: usize = 0;
-        let insns = self.linearize_instructions();
+        assert_eq!(self.basic_blocks.len(), 1, "Assembler should be linearized into a single block before arm64_emit");
+        let insns = &self.basic_blocks[0].insns;

         while let Some(insn) = insns.get(insn_idx) {
             // Update insn_idx that is shown on panic
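
With both new passes guaranteeing a single block, arm64_emit no longer re-linearizes: it asserts the invariant and reads the one block's instructions directly, so emission stays in sync with whichever pass ran before it.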
@@ -1610,7 +1690,7 @@ impl Assembler {
     /// Optimize and compile the stored instructions
     pub fn compile_with_regs(mut self, cb: &mut CodeBlock, regs: Vec<Reg>) -> Result<(CodePtr, Vec<CodePtr>), CompileError> {
         // The backend is allowed to use scratch registers only if it has not accepted them so far.
-        let use_scratch_reg = !self.accept_scratch_reg;
+        let scratch_reg_allowed = !self.accept_scratch_reg;

         // Initialize block labels before any processing
         self.init_block_labels();
@@ -1619,16 +1699,21 @@
         let asm = self.arm64_split();
         asm_dump!(asm, split);

         let mut asm = asm.alloc_regs(regs)?;
         asm_dump!(asm, alloc_regs);

         // We put compile_exits after alloc_regs to avoid extending live ranges for VRegs spilled on side exits.
         asm.compile_exits();
         asm_dump!(asm, compile_exits);

-        if use_scratch_reg {
+        // Both branches below linearize all blocks into a single giant block.
+        if scratch_reg_allowed {
             asm = asm.arm64_scratch_split();
             asm_dump!(asm, scratch_split);
+        } else {
+            // For trampolines that use scratch registers, resolve ParallelMov without scratch_reg.
+            asm = asm.resolve_parallel_mov_pass();
+            asm_dump!(asm, resolve_parallel_mov);
         }

         // Create label instances in the code block
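
Taken together, compile_with_regs now forks after compile_exits, and both branches leave the assembler fully linearized (flow sketch, not literal code):

    // arm64_split -> alloc_regs -> compile_exits
    //   -> arm64_scratch_split()         (scratch registers still available)
    //   -> resolve_parallel_mov_pass()   (trampolines: scratch registers already in use)
    // Either branch merges all blocks into one, which is exactly what the
    // assert_eq!(self.basic_blocks.len(), 1, ...) in arm64_emit checks.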

zjit/src/backend/lir.rs

Lines changed: 0 additions & 11 deletions
@@ -1997,17 +1997,6 @@ impl Assembler
         // Push instruction(s)
         let is_ccall = matches!(insn, Insn::CCall { .. });
         match insn {
-            Insn::ParallelMov { moves } => {
-                // For trampolines that use scratch registers, attempt to lower ParallelMov without scratch_reg.
-                if let Some(moves) = Self::resolve_parallel_moves(&moves, None) {
-                    for (dst, src) in moves {
-                        asm.mov(dst, src);
-                    }
-                } else {
-                    // If it needs a scratch_reg, leave it to *_split_with_scratch_regs to handle it.
-                    asm.push_insn(Insn::ParallelMov { moves });
-                }
-            }
             Insn::CCall { opnds, fptr, start_marker, end_marker, out } => {
                 // Split start_marker and end_marker here to avoid inserting push/pop between them.
                 if let Some(start_marker) = start_marker {
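
This deleted arm was the old eager lowering: the generic splitter tried to resolve each ParallelMov on the spot and re-pushed it for the scratch-register pass when a temporary was needed. With this commit, ParallelMov survives until after linearization and is lowered exactly once, in either arm64_scratch_split() or resolve_parallel_mov_pass().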

zjit/src/options.rs

Lines changed: 3 additions & 0 deletions
@@ -180,6 +180,8 @@ pub enum DumpLIR {
     alloc_regs,
     /// Dump LIR after compile_exits
     compile_exits,
+    /// Dump LIR after resolve_parallel_mov
+    resolve_parallel_mov,
     /// Dump LIR after {arch}_scratch_split
     scratch_split,
 }
@@ -190,6 +192,7 @@ const DUMP_LIR_ALL: &[DumpLIR] = &[
     DumpLIR::split,
     DumpLIR::alloc_regs,
     DumpLIR::compile_exits,
+    DumpLIR::resolve_parallel_mov,
     DumpLIR::scratch_split,
 ];
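
The new variant backs the asm_dump!(asm, resolve_parallel_mov) call added to compile_with_regs above, and DUMP_LIR_ALL lists it in pass order, between compile_exits and scratch_split.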