Skip to content

Commit 62dfbd6

Browse files
authored
Cranelift: use SP-offset amodes for stack_addr+load/store. (#11727)
We provide `stack_load` / `stack_store` / `stack_addr` instructions in Cranelift to operate on stack slots, and the first two are legalized to a `stack_addr` plus an ordinary load or store instruction. We currently have lowerings for `stack_addr` that materialize an SP-relative address into a register: for example, `leaq 8(%rsp), %rax` on x86-64 or `add x0, sp, #8` on aarch64. Taken together, we see sequences like (aarch64 / x86-64)

```
add x0, sp, #8 / leaq 8(%rsp), %rax
str x1, [x0]   / movq %rdx, (%rax)
```

when using `stack_store`s. In particular, we do *not* use the direct SP-relative form, which would look like

```
str x1, [sp, #8] / movq %rdx, 8(%rsp)
```

and which we can already generate in other cases, e.g. spillslot moves (spills/reloads) and clobber saves/restores. This inefficiency is undesirable whenever the embedder is using stackslots, but in particular when we expect to have high memory traffic to stack slots (e.g., I am seeing this now when implementing debug instrumentation in Wasmtime, and user stack map instrumentation for GC will also benefit).

This PR adds new lowerings that use the existing synthetic address mode we already use for spillslots to emit loads/stores to stackslots directly when possible. The PR does this for x86-64 and aarch64; others could be updated later.
1 parent f16de09 commit 62dfbd6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+451
-335
lines changed

cranelift/codegen/src/isa/aarch64/inst.isle

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3899,6 +3899,11 @@
38993899
(if-let new_offset (i32_checked_add x offset))
39003900
(amode_no_more_iconst ty y new_offset))
39013901

3902+
;; Stack-slot base: when the address operand is a `stack_addr`, fold the slot's
;; offset together with both constant offsets (`offset1` from the `stack_addr`,
;; `offset2` from the memory access) into a single `AMode.SlotOffset`, rather
;; than first materializing the slot address into a register.
(rule 3
3903+
(amode ty (stack_addr slot offset1) offset2)
3904+
(AMode.SlotOffset
3905+
(abi_stackslot_offset_into_slot_region slot offset1 offset2)))
3906+
39023907
(decl amode_no_more_iconst (Type Value i32) AMode)
39033908
;; Base case: move the `offset` into a register and add it to `val` via the
39043909
;; amode

cranelift/codegen/src/isa/x64/inst.isle

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,15 @@
403403
(decl amode_to_synthetic_amode (Amode) SyntheticAmode)
404404
(extern constructor amode_to_synthetic_amode amode_to_synthetic_amode)
405405

406+
;; Construct a `SyntheticAmode` from an `i32` offset into the stack-slot
;; region. Implemented externally (in Rust) by `synthetic_amode_slot`.
(decl synthetic_amode_slot (i32) SyntheticAmode)
407+
(extern constructor synthetic_amode_slot synthetic_amode_slot)
408+
409+
;; Helper for loads/stores to/from stackslots.
;;
;; Combines the stackslot's offset with the two `Offset32` arguments via
;; `abi_stackslot_offset_into_slot_region` and wraps the resulting `i32` in a
;; slot-relative `SyntheticAmode`, so no address register is needed.
410+
(decl stackslot_amode (StackSlot Offset32 Offset32) SyntheticAmode)
411+
(rule (stackslot_amode slot offset1 offset2)
412+
(let ((slot_offset i32 (abi_stackslot_offset_into_slot_region slot offset1 offset2)))
413+
(synthetic_amode_slot slot_offset)))
414+
406415
;; An `Amode` represents a possible addressing mode that can be used
407416
;; in instructions. These denote a 64-bit value only.
408417
(type Amode (enum
@@ -494,12 +503,16 @@
494503
(provide (= result (concat flags (bvadd val (sign_ext 64 offset)))))
495504
(require
496505
(= (widthof val) 64)))
497-
(decl to_amode (MemFlags Value Offset32) Amode)
506+
(decl to_amode (MemFlags Value Offset32) SyntheticAmode)
498507
(rule 0 (to_amode flags base offset)
499508
(amode_imm_reg flags base offset))
500509
(rule 1 (to_amode flags (iadd x y) offset)
501510
(to_amode_add flags x y offset))
502511

512+
;; A `stack_addr` base is addressed directly through `stackslot_amode`, using
;; the `stack_addr`'s own offset (`offset1`) and the access offset (`offset2`).
;; `flags` is not consulted on this path. Priority 2 places this rule ahead of
;; the generic `rule 0` / `rule 1` cases for `to_amode`.
(rule 2
513+
(to_amode flags (stack_addr slot offset1) offset2)
514+
(stackslot_amode slot offset1 offset2))
515+
503516
;; Same as `to_amode`, except that the base address is computed via the addition
504517
;; of the two `Value` arguments provided.
505518
;;
@@ -572,7 +585,7 @@
572585

573586
;; Offsetting an Amode. Used when we need to do consecutive
574587
;; loads/stores to adjacent addresses.
575-
(decl amode_offset (Amode i32) Amode)
588+
(decl amode_offset (SyntheticAmode i32) SyntheticAmode)
576589
(extern constructor amode_offset amode_offset)
577590

578591
;; Return a zero offset as an `Offset32`.
@@ -1368,7 +1381,7 @@
13681381
(rule 0 (x64_load (multi_lane _bits _lanes) addr _ext_kind)
13691382
(x64_movdqu_load addr))
13701383

1371-
(decl x64_mov (Amode) Reg)
1384+
(decl x64_mov (SyntheticAmode) Reg)
13721385
(spec (x64_mov addr)
13731386
(provide (= result (conv_to 64 (load_effect (extract 79 64 addr) 64 (extract 63 0 addr))))))
13741387
(rule (x64_mov addr) (x64_movq_rm addr))
@@ -2701,25 +2714,25 @@
27012714
(decl x64_pextrb (Xmm u8) Gpr)
27022715
(rule (x64_pextrb src lane) (x64_pextrb_a_or_avx src lane))
27032716

2704-
(decl x64_pextrb_store (Amode Xmm u8) SideEffectNoResult)
2717+
(decl x64_pextrb_store (SyntheticAmode Xmm u8) SideEffectNoResult)
27052718
(rule (x64_pextrb_store addr src lane) (x64_pextrb_a_mem_or_avx addr src lane))
27062719

27072720
(decl x64_pextrw (Xmm u8) Gpr)
27082721
(rule (x64_pextrw src lane) (x64_pextrw_a_or_avx src lane))
27092722

2710-
(decl x64_pextrw_store (Amode Xmm u8) SideEffectNoResult)
2723+
(decl x64_pextrw_store (SyntheticAmode Xmm u8) SideEffectNoResult)
27112724
(rule (x64_pextrw_store addr src lane) (x64_pextrw_b_mem_or_avx addr src lane))
27122725

27132726
(decl x64_pextrd (Xmm u8) Gpr)
27142727
(rule (x64_pextrd src lane) (x64_pextrd_a_or_avx src lane))
27152728

2716-
(decl x64_pextrd_store (Amode Xmm u8) SideEffectNoResult)
2729+
(decl x64_pextrd_store (SyntheticAmode Xmm u8) SideEffectNoResult)
27172730
(rule (x64_pextrd_store addr src lane) (x64_pextrd_a_mem_or_avx addr src lane))
27182731

27192732
(decl x64_pextrq (Xmm u8) Gpr)
27202733
(rule (x64_pextrq src lane) (x64_pextrq_a_or_avx src lane))
27212734

2722-
(decl x64_pextrq_store (Amode Xmm u8) SideEffectNoResult)
2735+
(decl x64_pextrq_store (SyntheticAmode Xmm u8) SideEffectNoResult)
27232736
(rule (x64_pextrq_store addr src lane) (x64_pextrq_a_mem_or_avx addr src lane))
27242737

27252738
;; Helper for creating `pmovmskb` instructions.
@@ -3181,7 +3194,7 @@
31813194
)
31823195
(x64_por low_gt_and_high_eq high_halves_gt)))
31833196

3184-
(decl x64_add_mem (Type Amode Value) SideEffectNoResult)
3197+
(decl x64_add_mem (Type SyntheticAmode Value) SideEffectNoResult)
31853198
(spec (x64_add_mem ty addr val)
31863199
(provide (= result (store_effect
31873200
(extract 79 64 addr)
@@ -3207,7 +3220,7 @@
32073220
(rule 2 (x64_add_mem $I32 addr (i8_from_iconst val)) (x64_addl_mi_sxb_mem addr val))
32083221
(rule 2 (x64_add_mem $I64 addr (i8_from_iconst val)) (x64_addq_mi_sxb_mem addr val))
32093222

3210-
(decl x64_sub_mem (Type Amode Value) SideEffectNoResult)
3223+
(decl x64_sub_mem (Type SyntheticAmode Value) SideEffectNoResult)
32113224

32123225
;; `sub mem, reg`
32133226
(rule 0 (x64_sub_mem $I8 addr val) (x64_subb_mr_mem addr val))
@@ -3223,7 +3236,7 @@
32233236
(rule 2 (x64_sub_mem $I32 addr (i8_from_iconst val)) (x64_subl_mi_sxb_mem addr val))
32243237
(rule 2 (x64_sub_mem $I64 addr (i8_from_iconst val)) (x64_subq_mi_sxb_mem addr val))
32253238

3226-
(decl x64_and_mem (Type Amode Value) SideEffectNoResult)
3239+
(decl x64_and_mem (Type SyntheticAmode Value) SideEffectNoResult)
32273240

32283241
;; `and mem, imm`
32293242
(rule (x64_and_mem $I8 addr val) (x64_andb_mr_mem addr val))
@@ -3241,7 +3254,7 @@
32413254
(rule 2 (x64_and_mem $I32 addr (i8_from_iconst val)) (x64_andl_mi_sxb_mem addr val))
32423255
(rule 2 (x64_and_mem $I64 addr (i8_from_iconst val)) (x64_andq_mi_sxb_mem addr val))
32433256

3244-
(decl x64_or_mem (Type Amode Value) SideEffectNoResult)
3257+
(decl x64_or_mem (Type SyntheticAmode Value) SideEffectNoResult)
32453258

32463259
;; `or mem, reg`
32473260
(rule 0 (x64_or_mem $I8 addr val) (x64_orb_mr_mem addr val))
@@ -3259,7 +3272,7 @@
32593272
(rule 2 (x64_or_mem $I32 addr (i8_from_iconst val)) (x64_orl_mi_sxb_mem addr val))
32603273
(rule 2 (x64_or_mem $I64 addr (i8_from_iconst val)) (x64_orq_mi_sxb_mem addr val))
32613274

3262-
(decl x64_xor_mem (Type Amode Value) SideEffectNoResult)
3275+
(decl x64_xor_mem (Type SyntheticAmode Value) SideEffectNoResult)
32633276

32643277
;; `xor mem, reg`
32653278
(rule 0 (x64_xor_mem $I8 addr val) (x64_xorb_mr_mem addr val))
@@ -3644,31 +3657,31 @@
36443657
(rule (x64_xchg $I32 addr operand) (x64_xchgl_rm operand addr))
36453658
(rule (x64_xchg $I64 addr operand) (x64_xchgq_rm operand addr))
36463659

3647-
(decl x64_lock_add (OperandSize Amode Gpr) SideEffectNoResult)
3660+
(decl x64_lock_add (OperandSize SyntheticAmode Gpr) SideEffectNoResult)
36483661
(rule (x64_lock_add (OperandSize.Size8) addr reg) (x64_lock_addb_mr_mem addr reg))
36493662
(rule (x64_lock_add (OperandSize.Size16) addr reg) (x64_lock_addw_mr_mem addr reg))
36503663
(rule (x64_lock_add (OperandSize.Size32) addr reg) (x64_lock_addl_mr_mem addr reg))
36513664
(rule (x64_lock_add (OperandSize.Size64) addr reg) (x64_lock_addq_mr_mem addr reg))
36523665

3653-
(decl x64_lock_sub (OperandSize Amode Gpr) SideEffectNoResult)
3666+
(decl x64_lock_sub (OperandSize SyntheticAmode Gpr) SideEffectNoResult)
36543667
(rule (x64_lock_sub (OperandSize.Size8) addr reg) (x64_lock_subb_mr_mem addr reg))
36553668
(rule (x64_lock_sub (OperandSize.Size16) addr reg) (x64_lock_subw_mr_mem addr reg))
36563669
(rule (x64_lock_sub (OperandSize.Size32) addr reg) (x64_lock_subl_mr_mem addr reg))
36573670
(rule (x64_lock_sub (OperandSize.Size64) addr reg) (x64_lock_subq_mr_mem addr reg))
36583671

3659-
(decl x64_lock_and (OperandSize Amode Gpr) SideEffectNoResult)
3672+
(decl x64_lock_and (OperandSize SyntheticAmode Gpr) SideEffectNoResult)
36603673
(rule (x64_lock_and (OperandSize.Size8) addr reg) (x64_lock_andb_mr_mem addr reg))
36613674
(rule (x64_lock_and (OperandSize.Size16) addr reg) (x64_lock_andw_mr_mem addr reg))
36623675
(rule (x64_lock_and (OperandSize.Size32) addr reg) (x64_lock_andl_mr_mem addr reg))
36633676
(rule (x64_lock_and (OperandSize.Size64) addr reg) (x64_lock_andq_mr_mem addr reg))
36643677

3665-
(decl x64_lock_or (OperandSize Amode Gpr) SideEffectNoResult)
3678+
(decl x64_lock_or (OperandSize SyntheticAmode Gpr) SideEffectNoResult)
36663679
(rule (x64_lock_or (OperandSize.Size8) addr reg) (x64_lock_orb_mr_mem addr reg))
36673680
(rule (x64_lock_or (OperandSize.Size16) addr reg) (x64_lock_orw_mr_mem addr reg))
36683681
(rule (x64_lock_or (OperandSize.Size32) addr reg) (x64_lock_orl_mr_mem addr reg))
36693682
(rule (x64_lock_or (OperandSize.Size64) addr reg) (x64_lock_orq_mr_mem addr reg))
36703683

3671-
(decl x64_lock_xor (OperandSize Amode Gpr) SideEffectNoResult)
3684+
(decl x64_lock_xor (OperandSize SyntheticAmode Gpr) SideEffectNoResult)
36723685
(rule (x64_lock_xor (OperandSize.Size8) addr reg) (x64_lock_xorb_mr_mem addr reg))
36733686
(rule (x64_lock_xor (OperandSize.Size16) addr reg) (x64_lock_xorw_mr_mem addr reg))
36743687
(rule (x64_lock_xor (OperandSize.Size32) addr reg) (x64_lock_xorl_mr_mem addr reg))

cranelift/codegen/src/isa/x64/inst/args.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,21 @@ impl SyntheticAmode {
573573
| SyntheticAmode::ConstantOffset { .. } => true,
574574
}
575575
}
576+
577+
/// Offset the synthetic amode by a fixed offset.
///
/// Works on a clone: `self` is left untouched and the adjusted copy is
/// returned. `Real` amodes delegate to the wrapped amode's own `offset`;
/// `SlotOffset` amodes have `offset` added to their `simm32`.
///
/// # Panics
///
/// Panics with "Cannot offset SyntheticAmode" for any variant other than
/// `Real` or `SlotOffset`.
578+
pub(crate) fn offset(&self, offset: i32) -> Self {
579+
let mut ret = self.clone();
580+
match &mut ret {
581+
SyntheticAmode::Real(amode) => *amode = amode.offset(offset),
582+
// NOTE(review): plain `+=` on an `i32`; unlike the `checked_add` calls
// used when the slot offset is first computed, an overflow here is only
// caught when overflow checks are enabled -- confirm the offsets passed
// in practice (e.g. 8 for the high half of an i128) cannot overflow.
SyntheticAmode::SlotOffset { simm32 } => *simm32 += offset,
583+
// `amode_offset` is used only in i128.load/store which
584+
// takes a synthetic amode from `to_amode`; `to_amode` can
585+
// only produce Real or SlotOffset amodes, never
586+
// IncomingArg or ConstantOffset.
587+
_ => panic!("Cannot offset SyntheticAmode: {self:?}"),
588+
}
589+
ret
590+
}
576591
}
577592

578593
impl From<Amode> for SyntheticAmode {

cranelift/codegen/src/isa/x64/lower.isle

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3056,8 +3056,8 @@
30563056
;; We can load an I128 by doing two 64-bit loads.
30573057
(rule -3 (lower (has_type $I128
30583058
(load (little_or_native_endian flags) address offset)))
3059-
(let ((addr_lo Amode (to_amode flags address offset))
3060-
(addr_hi Amode (amode_offset addr_lo 8))
3059+
(let ((addr_lo SyntheticAmode (to_amode flags address offset))
3060+
(addr_hi SyntheticAmode (amode_offset addr_lo 8))
30613061
(value_lo Reg (x64_mov addr_lo))
30623062
(value_hi Reg (x64_mov addr_hi)))
30633063
(value_regs value_lo value_hi)))
@@ -3187,8 +3187,8 @@
31873187
(let ((value_reg ValueRegs value)
31883188
(value_lo Gpr (value_regs_get_gpr value_reg 0))
31893189
(value_hi Gpr (value_regs_get_gpr value_reg 1))
3190-
(addr_lo Amode (to_amode flags address offset))
3191-
(addr_hi Amode (amode_offset addr_lo 8)))
3190+
(addr_lo SyntheticAmode (to_amode flags address offset))
3191+
(addr_hi SyntheticAmode (amode_offset addr_lo 8)))
31923192
(side_effect
31933193
(side_effect_concat
31943194
(x64_movrm $I64 addr_lo value_lo)

cranelift/codegen/src/isa/x64/lower/isle.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
409409
amode.clone().into()
410410
}
411411

412+
/// Build a `SyntheticAmode::SlotOffset` whose `simm32` field is `offset`.
/// The offset is stored as given; no range checking happens here.
#[inline]
413+
fn synthetic_amode_slot(&mut self, offset: i32) -> SyntheticAmode {
414+
SyntheticAmode::SlotOffset { simm32: offset }
415+
}
416+
412417
#[inline]
413418
fn const_to_synthetic_amode(&mut self, c: VCodeConstant) -> SyntheticAmode {
414419
SyntheticAmode::ConstantOffset(c)
@@ -633,7 +638,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
633638
}
634639

635640
#[inline]
636-
fn amode_offset(&mut self, addr: &Amode, offset: i32) -> Amode {
641+
fn amode_offset(&mut self, addr: &SyntheticAmode, offset: i32) -> SyntheticAmode {
637642
addr.offset(offset)
638643
}
639644

cranelift/codegen/src/machinst/abi.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2139,6 +2139,11 @@ impl<M: ABIMachineSpec> Callee<M> {
21392139
}
21402140
}
21412141

2142+
/// Get the raw offset of a sized stackslot in the slot region.
///
/// This is the entry recorded for `slot` in `self.sized_stackslots`,
/// returned unchanged as a `u32`; no address-computing instruction is
/// produced.
2143+
pub fn sized_stackslot_offset(&self, slot: StackSlot) -> u32 {
2144+
self.sized_stackslots[slot]
2145+
}
2146+
21422147
/// Produce an instruction that computes a sized stackslot address.
21432148
pub fn sized_stackslot_addr(
21442149
&self,

cranelift/codegen/src/machinst/isle.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,22 @@ macro_rules! isle_lower_prelude_methods {
507507
.into()
508508
}
509509

510+
/// Compute `sized_stackslot_offset(stack_slot) + offset1 + offset2` as an
/// `i32` offset into the stack-slot region.
///
/// # Panics
///
/// Panics if the slot's `u32` offset does not fit in an `i32`, or if either
/// addition overflows an `i32`. All three failure cases share the same
/// "Stack slot region cannot be larger than 2GiB" message.
fn abi_stackslot_offset_into_slot_region(
511+
&mut self,
512+
stack_slot: StackSlot,
513+
offset1: Offset32,
514+
offset2: Offset32,
515+
) -> i32 {
516+
let offset1 = i32::from(offset1);
517+
let offset2 = i32::from(offset2);
518+
// Convert the slot's u32 offset first, then add each displacement with an
// overflow check so a wrapped offset is never returned.
i32::try_from(self.lower_ctx.abi().sized_stackslot_offset(stack_slot))
519+
.expect("Stack slot region cannot be larger than 2GiB")
520+
.checked_add(offset1)
521+
.expect("Stack slot region cannot be larger than 2GiB")
522+
.checked_add(offset2)
523+
.expect("Stack slot region cannot be larger than 2GiB")
524+
}
525+
510526
fn abi_dynamic_stackslot_addr(
511527
&mut self,
512528
dst: WritableReg,

cranelift/codegen/src/prelude_lower.isle

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1119,6 +1119,10 @@
11191119
(decl abi_stackslot_addr (WritableReg StackSlot Offset32) MInst)
11201120
(extern constructor abi_stackslot_addr abi_stackslot_addr)
11211121

1122+
;; StackSlot raw offset into slot region.
;;
;; Takes the slot plus two `Offset32` displacements and yields a single `i32`
;; offset. Implemented externally (in Rust) under the same name.
1123+
(decl abi_stackslot_offset_into_slot_region (StackSlot Offset32 Offset32) i32)
1124+
(extern constructor abi_stackslot_offset_into_slot_region abi_stackslot_offset_into_slot_region)
1125+
11221126
;; DynamicStackSlot addr
11231127
(decl abi_dynamic_stackslot_addr (WritableReg DynamicStackSlot) MInst)
11241128
(extern constructor abi_dynamic_stackslot_addr abi_dynamic_stackslot_addr)

cranelift/filetests/filetests/isa/aarch64/dynamic-slot.clif

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,8 @@ block0:
1616
; mov fp, sp
1717
; sub sp, sp, #16
1818
; block0:
19-
; movz x1, #1
20-
; mov x2, sp
21-
; str x1, [x2]
19+
; movz x0, #1
20+
; str x0, [sp]
2221
; add sp, sp, #16
2322
; ldp fp, lr, [sp], #16
2423
; ret
@@ -29,9 +28,8 @@ block0:
2928
; mov x29, sp
3029
; sub sp, sp, #0x10
3130
; block1: ; offset 0xc
32-
; mov x1, #1
33-
; mov x2, sp
34-
; str x1, [x2]
31+
; mov x0, #1
32+
; stur x0, [sp]
3533
; add sp, sp, #0x10
3634
; ldp x29, x30, [sp], #0x10
3735
; ret
@@ -51,9 +49,8 @@ block0:
5149
; mov fp, sp
5250
; sub sp, sp, #16
5351
; block0:
54-
; movz x1, #1
55-
; mov x2, sp
56-
; str x1, [x2]
52+
; movz x0, #1
53+
; str x0, [sp]
5754
; add sp, sp, #16
5855
; ldp fp, lr, [sp], #16
5956
; ret
@@ -64,9 +61,8 @@ block0:
6461
; mov x29, sp
6562
; sub sp, sp, #0x10
6663
; block1: ; offset 0xc
67-
; mov x1, #1
68-
; mov x2, sp
69-
; str x1, [x2]
64+
; mov x0, #1
65+
; stur x0, [sp]
7066
; add sp, sp, #0x10
7167
; ldp x29, x30, [sp], #0x10
7268
; ret

0 commit comments

Comments
 (0)