Skip to content

Commit d82ebcc

Browse files
authored
x64: Enable load-coalescing for SSE/AVX instructions (bytecodealliance#5841)
* x64: Enable load-coalescing for SSE/AVX instructions This commit unlocks the ability to fold loads into operands of SSE and AVX instructions. This is beneficial for both function size when it happens in addition to being able to reduce register pressure. Previously this was not done because most SSE instructions require memory to be aligned. AVX instructions, however, do not have alignment requirements. The solution implemented here is one recommended by Chris which is to add a new `XmmMemAligned` newtype wrapper around `XmmMem`. All SSE instructions are now annotated as requiring an `XmmMemAligned` operand except for a new new instruction styles used specifically for instructions that don't require alignment (e.g. `movdqu`, `*sd`, and `*ss` instructions). All existing instruction helpers continue to take `XmmMem`, however. This way if an AVX lowering is chosen it can be used as-is. If an SSE lowering is chosen, however, then an automatic conversion from `XmmMem` to `XmmMemAligned` kicks in. This automatic conversion only fails for unaligned addresses in which case a load instruction is emitted and the operand becomes a temporary register instead. A number of prior `Xmm` arguments have now been converted to `XmmMem` as well. One change from this commit is that loading an unaligned operand for an SSE instruction previously would use the "correct type" of load, e.g. `movups` for f32x4 or `movup` for f64x2, but now the loading happens in a context without type information so the `movdqu` instruction is generated. According to [this stack overflow question][question] it looks like modern processors won't penalize this "wrong" choice of type when the operand is then used for f32 or f64 oriented instructions. Finally this commit improves some reuse of logic in the `put_in_*_mem*` helper to share code with `sinkable_load` and avoid duplication. With this in place some various ISLE rules have been updated as well. In the tests it can be seen that AVX-instructions are now automatically load-coalesced and use memory operands in a few cases. [question]: https://stackoverflow.com/questions/40854819/is-there-any-situation-where-using-movdqu-and-movupd-is-better-than-movups * Fix tests * Fix move-and-extend to be unaligned These don't have alignment requirements like other xmm instructions as well. Additionally add some ISA tests to ensure that their output is tested. * Review comments
1 parent c65de1f commit d82ebcc

File tree

11 files changed

+634
-313
lines changed

11 files changed

+634
-313
lines changed

cranelift/codegen/src/isa/x64/inst.isle

Lines changed: 133 additions & 73 deletions
Large diffs are not rendered by default.

cranelift/codegen/src/isa/x64/inst/args.rs

Lines changed: 119 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ macro_rules! newtype_of_reg {
3131
$newtype_reg:ident,
3232
$newtype_writable_reg:ident,
3333
$newtype_option_writable_reg:ident,
34-
$newtype_reg_mem:ident,
35-
$newtype_reg_mem_imm:ident,
34+
reg_mem: ($($newtype_reg_mem:ident $(aligned:$aligned:ident)?),*),
35+
reg_mem_imm: ($($newtype_reg_mem_imm:ident $(aligned:$aligned_imm:ident)?),*),
3636
$newtype_imm8_reg:ident,
3737
|$check_reg:ident| $check:expr
3838
) => {
@@ -102,108 +102,130 @@ macro_rules! newtype_of_reg {
102102
}
103103
}
104104

105-
/// A newtype wrapper around `RegMem` for general-purpose registers.
106-
#[derive(Clone, Debug)]
107-
pub struct $newtype_reg_mem(RegMem);
105+
$(
106+
/// A newtype wrapper around `RegMem` for general-purpose registers.
107+
#[derive(Clone, Debug)]
108+
pub struct $newtype_reg_mem(RegMem);
108109

109-
impl From<$newtype_reg_mem> for RegMem {
110-
fn from(rm: $newtype_reg_mem) -> Self {
111-
rm.0
110+
impl From<$newtype_reg_mem> for RegMem {
111+
fn from(rm: $newtype_reg_mem) -> Self {
112+
rm.0
113+
}
112114
}
113-
}
114115

115-
impl From<$newtype_reg> for $newtype_reg_mem {
116-
fn from(r: $newtype_reg) -> Self {
117-
$newtype_reg_mem(RegMem::reg(r.into()))
116+
impl From<$newtype_reg> for $newtype_reg_mem {
117+
fn from(r: $newtype_reg) -> Self {
118+
$newtype_reg_mem(RegMem::reg(r.into()))
119+
}
118120
}
119-
}
120121

121-
impl $newtype_reg_mem {
122-
/// Construct a `RegMem` newtype from the given `RegMem`, or return
123-
/// `None` if the `RegMem` is not a valid instance of this `RegMem`
124-
/// newtype.
125-
pub fn new(rm: RegMem) -> Option<Self> {
126-
match rm {
127-
RegMem::Mem { addr: _ } => Some(Self(rm)),
128-
RegMem::Reg { reg: $check_reg } if $check => Some(Self(rm)),
129-
RegMem::Reg { reg: _ } => None,
122+
impl $newtype_reg_mem {
123+
/// Construct a `RegMem` newtype from the given `RegMem`, or return
124+
/// `None` if the `RegMem` is not a valid instance of this `RegMem`
125+
/// newtype.
126+
pub fn new(rm: RegMem) -> Option<Self> {
127+
match rm {
128+
RegMem::Mem { addr } => {
129+
let mut _allow = true;
130+
$(
131+
if $aligned {
132+
_allow = addr.aligned();
133+
}
134+
)?
135+
if _allow {
136+
Some(Self(RegMem::Mem { addr }))
137+
} else {
138+
None
139+
}
140+
}
141+
RegMem::Reg { reg: $check_reg } if $check => Some(Self(rm)),
142+
RegMem::Reg { reg: _ } => None,
143+
}
130144
}
131-
}
132145

133-
/// Convert this newtype into its underlying `RegMem`.
134-
pub fn to_reg_mem(self) -> RegMem {
135-
self.0
136-
}
146+
/// Convert this newtype into its underlying `RegMem`.
147+
pub fn to_reg_mem(self) -> RegMem {
148+
self.0
149+
}
137150

138-
#[allow(dead_code)] // Used by some newtypes and not others.
139-
pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
140-
&self,
141-
collector: &mut OperandCollector<'_, F>,
142-
) {
143-
self.0.get_operands(collector);
151+
#[allow(dead_code)] // Used by some newtypes and not others.
152+
pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
153+
&self,
154+
collector: &mut OperandCollector<'_, F>,
155+
) {
156+
self.0.get_operands(collector);
157+
}
144158
}
145-
}
146-
impl PrettyPrint for $newtype_reg_mem {
147-
fn pretty_print(&self, size: u8, allocs: &mut AllocationConsumer<'_>) -> String {
148-
self.0.pretty_print(size, allocs)
159+
impl PrettyPrint for $newtype_reg_mem {
160+
fn pretty_print(&self, size: u8, allocs: &mut AllocationConsumer<'_>) -> String {
161+
self.0.pretty_print(size, allocs)
162+
}
149163
}
150-
}
164+
)*
151165

152-
/// A newtype wrapper around `RegMemImm`.
153-
#[derive(Clone, Debug)]
154-
pub struct $newtype_reg_mem_imm(RegMemImm);
166+
$(
167+
/// A newtype wrapper around `RegMemImm`.
168+
#[derive(Clone, Debug)]
169+
pub struct $newtype_reg_mem_imm(RegMemImm);
155170

156-
impl From<$newtype_reg_mem_imm> for RegMemImm {
157-
fn from(rmi: $newtype_reg_mem_imm) -> RegMemImm {
158-
rmi.0
159-
}
160-
}
161-
162-
impl From<$newtype_reg> for $newtype_reg_mem_imm {
163-
fn from(r: $newtype_reg) -> Self {
164-
$newtype_reg_mem_imm(RegMemImm::reg(r.into()))
171+
impl From<$newtype_reg_mem_imm> for RegMemImm {
172+
fn from(rmi: $newtype_reg_mem_imm) -> RegMemImm {
173+
rmi.0
174+
}
165175
}
166-
}
167176

168-
impl From<$newtype_reg_mem> for $newtype_reg_mem_imm {
169-
fn from(r: $newtype_reg_mem) -> Self {
170-
$newtype_reg_mem_imm(r.0.into())
177+
impl From<$newtype_reg> for $newtype_reg_mem_imm {
178+
fn from(r: $newtype_reg) -> Self {
179+
$newtype_reg_mem_imm(RegMemImm::reg(r.into()))
180+
}
171181
}
172-
}
173182

174-
impl $newtype_reg_mem_imm {
175-
/// Construct this newtype from the given `RegMemImm`, or return
176-
/// `None` if the `RegMemImm` is not a valid instance of this
177-
/// newtype.
178-
pub fn new(rmi: RegMemImm) -> Option<Self> {
179-
match rmi {
180-
RegMemImm::Imm { .. } => Some(Self(rmi)),
181-
RegMemImm::Mem { addr: _ } => Some(Self(rmi)),
182-
RegMemImm::Reg { reg: $check_reg } if $check => Some(Self(rmi)),
183-
RegMemImm::Reg { reg: _ } => None,
183+
impl $newtype_reg_mem_imm {
184+
/// Construct this newtype from the given `RegMemImm`, or return
185+
/// `None` if the `RegMemImm` is not a valid instance of this
186+
/// newtype.
187+
pub fn new(rmi: RegMemImm) -> Option<Self> {
188+
match rmi {
189+
RegMemImm::Imm { .. } => Some(Self(rmi)),
190+
RegMemImm::Mem { addr } => {
191+
let mut _allow = true;
192+
$(
193+
if $aligned_imm {
194+
_allow = addr.aligned();
195+
}
196+
)?
197+
if _allow {
198+
Some(Self(RegMemImm::Mem { addr }))
199+
} else {
200+
None
201+
}
202+
}
203+
RegMemImm::Reg { reg: $check_reg } if $check => Some(Self(rmi)),
204+
RegMemImm::Reg { reg: _ } => None,
205+
}
184206
}
185-
}
186207

187-
/// Convert this newtype into its underlying `RegMemImm`.
188-
#[allow(dead_code)] // Used by some newtypes and not others.
189-
pub fn to_reg_mem_imm(self) -> RegMemImm {
190-
self.0
191-
}
208+
/// Convert this newtype into its underlying `RegMemImm`.
209+
#[allow(dead_code)] // Used by some newtypes and not others.
210+
pub fn to_reg_mem_imm(self) -> RegMemImm {
211+
self.0
212+
}
192213

193-
#[allow(dead_code)] // Used by some newtypes and not others.
194-
pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
195-
&self,
196-
collector: &mut OperandCollector<'_, F>,
197-
) {
198-
self.0.get_operands(collector);
214+
#[allow(dead_code)] // Used by some newtypes and not others.
215+
pub(crate) fn get_operands<F: Fn(VReg) -> VReg>(
216+
&self,
217+
collector: &mut OperandCollector<'_, F>,
218+
) {
219+
self.0.get_operands(collector);
220+
}
199221
}
200-
}
201222

202-
impl PrettyPrint for $newtype_reg_mem_imm {
203-
fn pretty_print(&self, size: u8, allocs: &mut AllocationConsumer<'_>) -> String {
204-
self.0.pretty_print(size, allocs)
223+
impl PrettyPrint for $newtype_reg_mem_imm {
224+
fn pretty_print(&self, size: u8, allocs: &mut AllocationConsumer<'_>) -> String {
225+
self.0.pretty_print(size, allocs)
226+
}
205227
}
206-
}
228+
)*
207229

208230
/// A newtype wrapper around `Imm8Reg`.
209231
#[derive(Clone, Debug)]
@@ -242,8 +264,8 @@ newtype_of_reg!(
242264
Gpr,
243265
WritableGpr,
244266
OptionWritableGpr,
245-
GprMem,
246-
GprMemImm,
267+
reg_mem: (GprMem),
268+
reg_mem_imm: (GprMemImm),
247269
Imm8Gpr,
248270
|reg| reg.class() == RegClass::Int
249271
);
@@ -253,8 +275,8 @@ newtype_of_reg!(
253275
Xmm,
254276
WritableXmm,
255277
OptionWritableXmm,
256-
XmmMem,
257-
XmmMemImm,
278+
reg_mem: (XmmMem, XmmMemAligned aligned:true),
279+
reg_mem_imm: (XmmMemImm, XmmMemAlignedImm aligned:true),
258280
Imm8Xmm,
259281
|reg| reg.class() == RegClass::Float
260282
);
@@ -420,6 +442,10 @@ impl Amode {
420442
}
421443
ret
422444
}
445+
446+
pub(crate) fn aligned(&self) -> bool {
447+
self.get_flags().aligned()
448+
}
423449
}
424450

425451
impl PrettyPrint for Amode {
@@ -531,6 +557,13 @@ impl SyntheticAmode {
531557
}
532558
}
533559
}
560+
561+
pub(crate) fn aligned(&self) -> bool {
562+
match self {
563+
SyntheticAmode::Real(addr) => addr.aligned(),
564+
SyntheticAmode::NominalSPOffset { .. } | SyntheticAmode::ConstantOffset { .. } => true,
565+
}
566+
}
534567
}
535568

536569
impl Into<SyntheticAmode> for Amode {

cranelift/codegen/src/isa/x64/inst/emit.rs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1731,7 +1731,21 @@ pub(crate) fn emit(
17311731
sink.bind_label(else_label);
17321732
}
17331733

1734-
Inst::XmmUnaryRmR {
1734+
Inst::XmmUnaryRmR { op, src, dst } => {
1735+
emit(
1736+
&Inst::XmmUnaryRmRUnaligned {
1737+
op: *op,
1738+
src: XmmMem::new(src.clone().into()).unwrap(),
1739+
dst: *dst,
1740+
},
1741+
allocs,
1742+
sink,
1743+
info,
1744+
state,
1745+
);
1746+
}
1747+
1748+
Inst::XmmUnaryRmRUnaligned {
17351749
op,
17361750
src: src_e,
17371751
dst: reg_g,
@@ -1842,6 +1856,24 @@ pub(crate) fn emit(
18421856
}
18431857

18441858
Inst::XmmRmR {
1859+
op,
1860+
src1,
1861+
src2,
1862+
dst,
1863+
} => emit(
1864+
&Inst::XmmRmRUnaligned {
1865+
op: *op,
1866+
dst: *dst,
1867+
src1: *src1,
1868+
src2: XmmMem::new(src2.clone().to_reg_mem()).unwrap(),
1869+
},
1870+
allocs,
1871+
sink,
1872+
info,
1873+
state,
1874+
),
1875+
1876+
Inst::XmmRmRUnaligned {
18451877
op,
18461878
src1,
18471879
src2: src_e,

cranelift/codegen/src/isa/x64/inst/emit_tests.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ impl Inst {
3434
debug_assert!(dst.to_reg().class() == RegClass::Float);
3535
Inst::XmmUnaryRmRImm {
3636
op,
37-
src: XmmMem::new(src).unwrap(),
37+
src: XmmMemAligned::new(src).unwrap(),
3838
imm,
3939
dst: WritableXmm::from_writable_reg(dst).unwrap(),
4040
}
@@ -56,7 +56,7 @@ impl Inst {
5656
Inst::XmmRmiReg {
5757
opcode,
5858
src1: Xmm::new(dst.to_reg()).unwrap(),
59-
src2: XmmMemImm::new(src).unwrap(),
59+
src2: XmmMemAlignedImm::new(src).unwrap(),
6060
dst: WritableXmm::from_writable_reg(dst).unwrap(),
6161
}
6262
}
@@ -96,7 +96,7 @@ impl Inst {
9696
debug_assert!(dst.to_reg().class() == RegClass::Float);
9797
Inst::XmmUnaryRmR {
9898
op,
99-
src: XmmMem::new(src).unwrap(),
99+
src: XmmMemAligned::new(src).unwrap(),
100100
dst: WritableXmm::from_writable_reg(dst).unwrap(),
101101
}
102102
}
@@ -136,7 +136,7 @@ impl Inst {
136136
Inst::XmmRmRBlend {
137137
op,
138138
src1: Xmm::new(dst.to_reg()).unwrap(),
139-
src2: XmmMem::new(src2).unwrap(),
139+
src2: XmmMemAligned::new(src2).unwrap(),
140140
mask: Xmm::new(regs::xmm0()).unwrap(),
141141
dst: WritableXmm::from_writable_reg(dst).unwrap(),
142142
}

0 commit comments

Comments
 (0)