Skip to content

Commit f6f447b

Browse files
authored
pulley: Add macro CallN instructions (bytecodealliance#9874)
* pulley: Add macro `CallN` instructions. This commit adds new macro instructions to assist with speeding up calls between functions. Pulley's previous `Call` instruction was similar to native call instructions, where arguments/results are implicitly in the right location according to the ABI, but movement between registers is more expensive with Pulley than with native architectures. The `CallN` instructions here enable listing a few arguments (only integer registers) in the opcode itself. This removes the need for individual `xmov` instructions into individual registers; instead, all of the moves can be done within the opcode handlers. This additionally enables a value that is passed as more than one argument to the same function to reside in only one register. Finally, parallel-copies between these registers are supported because the interpreter loads all registers and then stores all registers. These new instructions participate in register allocation differently from before: the first few arguments are allowed to be in any register and no longer use `reg_fixed_use`. All other arguments (all float arguments, for example) continue to use `reg_fixed_use`. Locally, sightglass reports this change speeding up `pulldown-cmark` by 2-10%. On a `fib(N)` micro-benchmark it didn't help as much as I had hoped. * Fix MSRV
1 parent d621b45 commit f6f447b

File tree

15 files changed

+293
-144
lines changed

15 files changed

+293
-144
lines changed

cranelift/codegen/meta/src/pulley.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ impl Inst<'_> {
8989
match self.name {
9090
// Skip instructions related to control-flow as those require
9191
// special handling with `MachBuffer`.
92-
"Jump" | "Call" | "CallIndirect" => true,
92+
"Jump" => true,
93+
n if n.starts_with("Call") => true,
9394

9495
// Skip special instructions not used in Cranelift.
9596
"XPush32Many" | "XPush64Many" | "XPop32Many" | "XPop64Many" => true,

cranelift/codegen/src/isa/pulley_shared/abi.rs

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -441,15 +441,45 @@ where
441441
fn gen_call(
442442
dest: &CallDest,
443443
_tmp: Writable<Reg>,
444-
info: CallInfo<()>,
444+
mut info: CallInfo<()>,
445445
) -> SmallVec<[Self::I; 2]> {
446446
match dest {
447447
// "near" calls are pulley->pulley calls so they use a normal "call"
448448
// opcode
449-
CallDest::ExtName(name, RelocDistance::Near) => smallvec![Inst::Call {
450-
info: Box::new(info.map(|()| name.clone()))
449+
CallDest::ExtName(name, RelocDistance::Near) => {
450+
// The first four integer arguments to a call can be handled via
451+
// special pulley call instructions. Assert here that
452+
// `info.uses` is sorted in order and then take out x0-x3 if
453+
// they're present and move them from `info.uses` to
454+
// `info.dest.args` to be handled differently during register
455+
// allocation.
456+
let mut args = SmallVec::new();
457+
if cfg!(debug_assertions) {
458+
let xargs = info
459+
.uses
460+
.iter()
461+
.filter_map(|a| XReg::new(a.preg))
462+
.collect::<Vec<_>>();
463+
for window in xargs.windows(2) {
464+
assert!(window[0] < window[1]);
465+
}
466+
}
467+
info.uses.retain(|arg| {
468+
if arg.preg != x0() && arg.preg != x1() && arg.preg != x2() && arg.preg != x3()
469+
{
470+
return true;
471+
}
472+
args.push(XReg::new(arg.vreg).unwrap());
473+
false
474+
});
475+
smallvec![Inst::Call {
476+
info: Box::new(info.map(|()| PulleyCall {
477+
name: name.clone(),
478+
args,
479+
}))
480+
}
481+
.into()]
451482
}
452-
.into()],
453483
// "far" calls are pulley->host calls so they use a different opcode
454484
// which is lowered with a special relocation in the backend.
455485
CallDest::ExtName(name, RelocDistance::Far) => smallvec![Inst::IndirectCallHost {

cranelift/codegen/src/isa/pulley_shared/inst.isle

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858

5959
;; An indirect call out to a host-defined function. The host function
6060
;; pointer is the first "argument" of this function call.
61-
(IndirectCallHost (info BoxCallInfo))
61+
(IndirectCallHost (info BoxCallIndirectHostInfo))
6262

6363
;; Unconditional jumps.
6464
(Jump (label MachLabel))
@@ -154,6 +154,7 @@
154154
(type BoxReturnCallInfo (primitive BoxReturnCallInfo))
155155
(type BoxReturnCallIndInfo (primitive BoxReturnCallIndInfo))
156156
(type XRegSet (primitive XRegSet))
157+
(type BoxCallIndirectHostInfo (primitive BoxCallIndirectHostInfo))
157158

158159
;;;; Address Modes ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
159160

cranelift/codegen/src/isa/pulley_shared/inst/args.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
//! Pulley instruction arguments.
22
33
use super::*;
4+
use crate::ir::ExternalName;
45
use crate::machinst::abi::StackAMode;
56
use pulley_interpreter::encode;
67
use pulley_interpreter::regs::Reg as _;
@@ -565,3 +566,15 @@ impl fmt::Display for Cond {
565566
}
566567
}
567568
}
569+
570+
/// Payload of `CallInfo` for call instructions
571+
#[derive(Clone, Debug)]
572+
pub struct PulleyCall {
573+
/// The external name that's being called, or the Cranelift-generated
574+
/// function that's being invoked.
575+
pub name: ExternalName,
576+
/// Arguments tracked in this call invocation which aren't assigned fixed
577+
/// registers. This tracks up to 4 registers and all remaining registers
578+
/// will be present and tracked in `CallInfo<T>` fields.
579+
pub args: SmallVec<[XReg; 4]>,
580+
}

cranelift/codegen/src/isa/pulley_shared/inst/emit.rs

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -172,16 +172,36 @@ fn pulley_emit<P>(
172172
Inst::LoadExtName { .. } => todo!(),
173173

174174
Inst::Call { info } => {
175-
sink.put1(pulley_interpreter::Opcode::Call as u8);
176-
sink.add_reloc(
175+
let offset = sink.cur_offset();
176+
177+
// If arguments happen to already be in the right register for the
178+
// ABI then remove them from this list. Otherwise emit the
179+
// appropriate `Call` instruction depending on how many arguments we
180+
// have that aren't already in their correct register according to
181+
// ABI conventions.
182+
let mut args = &info.dest.args[..];
183+
while !args.is_empty() && args.last().copied() == XReg::new(x_reg(args.len() - 1)) {
184+
args = &args[..args.len() - 1];
185+
}
186+
match args {
187+
[] => enc::call(sink, 0),
188+
[x0] => enc::call1(sink, x0, 0),
189+
[x0, x1] => enc::call2(sink, x0, x1, 0),
190+
[x0, x1, x2] => enc::call3(sink, x0, x1, x2, 0),
191+
[x0, x1, x2, x3] => enc::call4(sink, x0, x1, x2, x3, 0),
192+
_ => unreachable!(),
193+
}
194+
let end = sink.cur_offset();
195+
sink.add_reloc_at_offset(
196+
end - 4,
177197
// TODO: is it actually okay to reuse this reloc here?
178198
Reloc::X86CallPCRel4,
179-
&info.dest,
199+
&info.dest.name,
180200
// This addend adjusts for the difference between the start of
181-
// the instruction and the beginning of the immediate field.
182-
-1,
201+
// the instruction and the beginning of the immediate offset
202+
// field which is always the final 4 bytes of the instruction.
203+
-i64::from(end - offset - 4),
183204
);
184-
sink.put4(0);
185205
if let Some(s) = state.take_stack_map() {
186206
let offset = sink.cur_offset();
187207
sink.push_user_stack_map(state, offset, s);

cranelift/codegen/src/isa/pulley_shared/inst/mod.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,29 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) {
151151
collector.reg_def(dst);
152152
}
153153

154-
Inst::Call { info } | Inst::IndirectCallHost { info } => {
154+
Inst::Call { info } => {
155+
let CallInfo {
156+
uses, defs, dest, ..
157+
} = &mut **info;
158+
159+
// Pulley supports having the first few integer arguments in any
160+
// register, so flag that with `reg_use` here.
161+
let PulleyCall { args, .. } = dest;
162+
for arg in args {
163+
collector.reg_use(arg);
164+
}
165+
166+
// Remaining arguments (and return values) are all in fixed
167+
// registers according to Pulley's ABI, however.
168+
for CallArgPair { vreg, preg } in uses {
169+
collector.reg_fixed_use(vreg, *preg);
170+
}
171+
for CallRetPair { vreg, preg } in defs {
172+
collector.reg_fixed_def(vreg, *preg);
173+
}
174+
collector.reg_clobbers(info.clobbers);
175+
}
176+
Inst::IndirectCallHost { info } => {
155177
let CallInfo { uses, defs, .. } = &mut **info;
156178
for CallArgPair { vreg, preg } in uses {
157179
collector.reg_fixed_use(vreg, *preg);

cranelift/codegen/src/isa/pulley_shared/lower/isle.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ use crate::ir::{condcodes::*, immediates::*, types::*, *};
1010
use crate::isa::pulley_shared::{
1111
abi::*,
1212
inst::{
13-
FReg, OperandSize, ReturnCallInfo, VReg, WritableFReg, WritableVReg, WritableXReg, XReg,
13+
FReg, OperandSize, PulleyCall, ReturnCallInfo, VReg, WritableFReg, WritableVReg,
14+
WritableXReg, XReg,
1415
},
1516
lower::{regs, Cond},
1617
*,
@@ -26,8 +27,9 @@ use regalloc2::PReg;
2627
type Unit = ();
2728
type VecArgPair = Vec<ArgPair>;
2829
type VecRetPair = Vec<RetPair>;
29-
type BoxCallInfo = Box<CallInfo<ExternalName>>;
30+
type BoxCallInfo = Box<CallInfo<PulleyCall>>;
3031
type BoxCallIndInfo = Box<CallInfo<XReg>>;
32+
type BoxCallIndirectHostInfo = Box<CallInfo<ExternalName>>;
3133
type BoxReturnCallInfo = Box<ReturnCallInfo<ExternalName>>;
3234
type BoxReturnCallIndInfo = Box<ReturnCallInfo<XReg>>;
3335
type BoxExternalName = Box<ExternalName>;

cranelift/codegen/src/isa/s390x/inst/emit.rs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,13 @@ pub fn mem_emit(
220220
&MemArg::Symbol {
221221
ref name, offset, ..
222222
} => {
223-
sink.add_reloc_at_offset(2, Reloc::S390xPCRel32Dbl, &**name, (offset + 2).into());
223+
let reloc_offset = sink.cur_offset() + 2;
224+
sink.add_reloc_at_offset(
225+
reloc_offset,
226+
Reloc::S390xPCRel32Dbl,
227+
&**name,
228+
(offset + 2).into(),
229+
);
224230
put(sink, &enc_ril_b(opcode_ril.unwrap(), rd, 0));
225231
}
226232
_ => unreachable!(),
@@ -3198,7 +3204,8 @@ impl Inst {
31983204
// Add relocation for target function. This has to be done *before*
31993205
// the S390xTlsGdCall relocation if any, to ensure linker relaxation
32003206
// works correctly.
3201-
sink.add_reloc_at_offset(2, Reloc::S390xPLTRel32Dbl, &info.dest, 2);
3207+
let offset = sink.cur_offset() + 2;
3208+
sink.add_reloc_at_offset(offset, Reloc::S390xPLTRel32Dbl, &info.dest, 2);
32023209

32033210
if let Some(s) = state.take_stack_map() {
32043211
let offset = sink.cur_offset() + 6;
@@ -3232,7 +3239,8 @@ impl Inst {
32323239
}
32333240

32343241
let opcode = 0xc04; // BCRL
3235-
sink.add_reloc_at_offset(2, Reloc::S390xPLTRel32Dbl, &info.dest, 2);
3242+
let offset = sink.cur_offset() + 2;
3243+
sink.add_reloc_at_offset(offset, Reloc::S390xPLTRel32Dbl, &info.dest, 2);
32363244
put(sink, &enc_ril_c(opcode, 15, 0));
32373245
sink.add_call_site();
32383246
}
@@ -3257,7 +3265,8 @@ impl Inst {
32573265
// *before* the S390xTlsGdCall, to ensure linker relaxation
32583266
// works correctly.
32593267
let dest = ExternalName::LibCall(LibCall::ElfTlsGetOffset);
3260-
sink.add_reloc_at_offset(2, Reloc::S390xPLTRel32Dbl, &dest, 2);
3268+
let offset = sink.cur_offset() + 2;
3269+
sink.add_reloc_at_offset(offset, Reloc::S390xPLTRel32Dbl, &dest, 2);
32613270
match &**symbol {
32623271
SymbolReloc::TlsGd { name } => sink.add_reloc(Reloc::S390xTlsGdCall, name, 0),
32633272
_ => unreachable!(),

cranelift/codegen/src/machinst/buffer.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,7 +1536,7 @@ impl<I: VCodeInst> MachBuffer<I> {
15361536
}
15371537
}
15381538

1539-
/// Add an external relocation at the given offset from current offset.
1539+
/// Add an external relocation at the given offset.
15401540
pub fn add_reloc_at_offset<T: Into<RelocTarget> + Clone>(
15411541
&mut self,
15421542
offset: CodeOffset,
@@ -1579,7 +1579,7 @@ impl<I: VCodeInst> MachBuffer<I> {
15791579
// when a relocation can't otherwise be resolved later, so it shouldn't
15801580
// actually result in any memory unsafety or anything like that.
15811581
self.relocs.push(MachReloc {
1582-
offset: self.data.len() as CodeOffset + offset,
1582+
offset,
15831583
kind,
15841584
target,
15851585
addend,
@@ -1593,7 +1593,7 @@ impl<I: VCodeInst> MachBuffer<I> {
15931593
target: &T,
15941594
addend: Addend,
15951595
) {
1596-
self.add_reloc_at_offset(0, kind, target, addend);
1596+
self.add_reloc_at_offset(self.data.len() as CodeOffset, kind, target, addend);
15971597
}
15981598

15991599
/// Add a trap record at the current offset.

0 commit comments

Comments
 (0)