Skip to content

Commit 9982992

Browse files
authored
x64: Fix false dependencies in int-to-float conversions (bytecodealliance#7098)
* x64: Fix false dependencies in int-to-float conversions This commit is a result of the investigation on bytecodealliance#7085. The int-to-float conversion instructions used right now on the x64 backend will implicitly source the upper bits of the result from a different register. This implicitly creates a dependency on further consumers using the conversion result on whatever previously defined the upper bits, even though they aren't used. This false dependency is the primary reason for the slowdown witnessed in bytecodealliance#7085. The fix chosen in this commit is to model the int-to-float instructions with a new shape of instruction instead of the previous `GprToXmm{,Vex}`. This previous shape was modeled as single-input and single-output, but this does not reflect the actual nature of the `cvtsi2s{s,d}` instructions. Instead these now use `CvtIntToFloat{,Vex}` which have two source operands and one destination operand, modeling how the upper bits of a different register are used. In lowerings using this instruction the upper bits to preserver are always sourced from a zero'd out register to force breaking dependencies between instructions. Closes bytecodealliance#7085 * Remove now dead code * Remove outdated test Golden test output covers this test case anyway nowadays * Review comments * Fix emit tests
1 parent b7c0eae commit 9982992

File tree

8 files changed

+306
-118
lines changed

8 files changed

+306
-118
lines changed

cranelift/codegen/src/isa/x64/inst.isle

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,23 @@
437437
(dst WritableXmm)
438438
(src_size OperandSize))
439439

440+
;; Conversion from signed integers to floats, the `{v,}`cvtsi2s{s,d}`
441+
;; instructions.
442+
;;
443+
;; Note that this is special in that `src1` is an xmm/float register
444+
;; while `src2` is a general purpose register as this is converting an
445+
;; integer in a gpr to an equivalent float in an xmm reg.
446+
(CvtIntToFloat (op SseOpcode)
447+
(src1 Xmm)
448+
(src2 GprMem)
449+
(dst WritableXmm)
450+
(src2_size OperandSize))
451+
(CvtIntToFloatVex (op AvxOpcode)
452+
(src1 Xmm)
453+
(src2 GprMem)
454+
(dst WritableXmm)
455+
(src2_size OperandSize))
456+
440457
;; Converts an unsigned int64 to a float32/float64.
441458
(CvtUint64ToFloatSeq (dst_size OperandSize) ;; 4 or 8
442459
(src Gpr)
@@ -2095,6 +2112,18 @@
20952112
(_ Unit (emit (MInst.UnaryRmRImmVex size op src dst imm))))
20962113
dst))
20972114

2115+
(decl cvt_int_to_float (SseOpcode Xmm GprMem OperandSize) Xmm)
2116+
(rule (cvt_int_to_float op src1 src2 size)
2117+
(let ((dst WritableXmm (temp_writable_xmm))
2118+
(_ Unit (emit (MInst.CvtIntToFloat op src1 src2 dst size))))
2119+
dst))
2120+
2121+
(decl cvt_int_to_float_vex (AvxOpcode Xmm GprMem OperandSize) Xmm)
2122+
(rule (cvt_int_to_float_vex op src1 src2 size)
2123+
(let ((dst WritableXmm (temp_writable_xmm))
2124+
(_ Unit (emit (MInst.CvtIntToFloatVex op src1 src2 dst size))))
2125+
dst))
2126+
20982127
(decl cvt_u64_to_float_seq (Type Gpr) Xmm)
20992128
(rule (cvt_u64_to_float_seq ty src)
21002129
(let ((size OperandSize (raw_operand_size_of_type ty))
@@ -4351,20 +4380,20 @@
43514380
(xmm_unary_rm_r_vex (AvxOpcode.Vcvtdq2pd) x))
43524381

43534382
;; Helper for creating `cvtsi2ss` instructions.
4354-
(decl x64_cvtsi2ss (Type GprMem) Xmm)
4355-
(rule (x64_cvtsi2ss ty x)
4356-
(gpr_to_xmm (SseOpcode.Cvtsi2ss) x (raw_operand_size_of_type ty)))
4357-
(rule 1 (x64_cvtsi2ss ty x)
4383+
(decl x64_cvtsi2ss (Type Xmm GprMem) Xmm)
4384+
(rule (x64_cvtsi2ss ty x y)
4385+
(cvt_int_to_float (SseOpcode.Cvtsi2ss) x y (raw_operand_size_of_type ty)))
4386+
(rule 1 (x64_cvtsi2ss ty x y)
43584387
(if-let $true (use_avx))
4359-
(gpr_to_xmm_vex (AvxOpcode.Vcvtsi2ss) x (raw_operand_size_of_type ty)))
4388+
(cvt_int_to_float_vex (AvxOpcode.Vcvtsi2ss) x y (raw_operand_size_of_type ty)))
43604389

43614390
;; Helper for creating `cvtsi2sd` instructions.
4362-
(decl x64_cvtsi2sd (Type GprMem) Xmm)
4363-
(rule (x64_cvtsi2sd ty x)
4364-
(gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty)))
4365-
(rule 1 (x64_cvtsi2sd ty x)
4391+
(decl x64_cvtsi2sd (Type Xmm GprMem) Xmm)
4392+
(rule (x64_cvtsi2sd ty x y)
4393+
(cvt_int_to_float (SseOpcode.Cvtsi2sd) x y (raw_operand_size_of_type ty)))
4394+
(rule 1 (x64_cvtsi2sd ty x y)
43664395
(if-let $true (use_avx))
4367-
(gpr_to_xmm_vex (AvxOpcode.Vcvtsi2sd) x (raw_operand_size_of_type ty)))
4396+
(cvt_int_to_float_vex (AvxOpcode.Vcvtsi2sd) x y (raw_operand_size_of_type ty)))
43684397

43694398
;; Helper for creating `cvttps2dq` instructions.
43704399
(decl x64_cvttps2dq (XmmMem) Xmm)

cranelift/codegen/src/isa/x64/inst/emit.rs

Lines changed: 77 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,14 @@ fn emit_signed_cvt(
3232
} else {
3333
SseOpcode::Cvtsi2ss
3434
};
35-
let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst);
36-
inst.emit(&[], sink, info, state);
35+
Inst::CvtIntToFloat {
36+
op,
37+
dst: Writable::from_reg(Xmm::new(dst.to_reg()).unwrap()),
38+
src1: Xmm::new(dst.to_reg()).unwrap(),
39+
src2: GprMem::new(RegMem::reg(src)).unwrap(),
40+
src2_size: OperandSize::Size64,
41+
}
42+
.emit(&[], sink, info, state);
3743
}
3844

3945
/// Emits a one way conditional jump if CC is set (true).
@@ -2872,30 +2878,21 @@ pub(crate) fn emit(
28722878
let (prefix, map, opcode) = match op {
28732879
// vmovd/vmovq are differentiated by `w`
28742880
AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E),
2875-
AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
2876-
AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
28772881
_ => unimplemented!("Opcode {:?} not implemented", op),
28782882
};
28792883
let w = match src_size {
28802884
OperandSize::Size64 => true,
28812885
_ => false,
28822886
};
2883-
let mut insn = VexInstruction::new()
2887+
VexInstruction::new()
28842888
.length(VexVectorLength::V128)
28852889
.w(w)
28862890
.prefix(prefix)
28872891
.map(map)
28882892
.opcode(opcode)
28892893
.rm(src)
2890-
.reg(dst.to_real_reg().unwrap().hw_enc());
2891-
// These opcodes technically take a second operand which is the
2892-
// upper bits to preserve during the float conversion. We don't
2893-
// actually use this in this backend right now so reuse the
2894-
// destination register. This at least matches what LLVM does.
2895-
if let AvxOpcode::Vcvtsi2ss | AvxOpcode::Vcvtsi2sd = op {
2896-
insn = insn.vvvv(dst.to_real_reg().unwrap().hw_enc());
2897-
}
2898-
insn.encode(sink);
2894+
.reg(dst.to_real_reg().unwrap().hw_enc())
2895+
.encode(sink);
28992896
}
29002897

29012898
Inst::XmmRmREvex {
@@ -3200,8 +3197,6 @@ pub(crate) fn emit(
32003197
// Movd and movq use the same opcode; the presence of the REX prefix (set below)
32013198
// actually determines which is used.
32023199
SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
3203-
SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
3204-
SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
32053200
_ => panic!("unexpected opcode {:?}", op),
32063201
};
32073202
let rex = RexFlags::from(*src_size);
@@ -3239,6 +3234,72 @@ pub(crate) fn emit(
32393234
}
32403235
}
32413236

3237+
Inst::CvtIntToFloat {
3238+
op,
3239+
src1,
3240+
src2,
3241+
dst,
3242+
src2_size,
3243+
} => {
3244+
let src1 = allocs.next(src1.to_reg());
3245+
let dst = allocs.next(dst.to_reg().to_reg());
3246+
assert_eq!(src1, dst);
3247+
let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
3248+
3249+
let (prefix, opcode) = match op {
3250+
SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
3251+
SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
3252+
_ => panic!("unexpected opcode {:?}", op),
3253+
};
3254+
let rex = RexFlags::from(*src2_size);
3255+
match src2 {
3256+
RegMem::Reg { reg: src2 } => {
3257+
emit_std_reg_reg(sink, prefix, opcode, 2, dst, src2, rex);
3258+
}
3259+
RegMem::Mem { addr } => {
3260+
let addr = &addr.finalize(state, sink);
3261+
emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex, 0);
3262+
}
3263+
}
3264+
}
3265+
3266+
Inst::CvtIntToFloatVex {
3267+
op,
3268+
src1,
3269+
src2,
3270+
dst,
3271+
src2_size,
3272+
} => {
3273+
let dst = allocs.next(dst.to_reg().to_reg());
3274+
let src1 = allocs.next(src1.to_reg());
3275+
let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
3276+
RegMem::Reg { reg } => {
3277+
RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
3278+
}
3279+
RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
3280+
};
3281+
3282+
let (prefix, map, opcode) = match op {
3283+
AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
3284+
AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
3285+
_ => unimplemented!("Opcode {:?} not implemented", op),
3286+
};
3287+
let w = match src2_size {
3288+
OperandSize::Size64 => true,
3289+
_ => false,
3290+
};
3291+
VexInstruction::new()
3292+
.length(VexVectorLength::V128)
3293+
.w(w)
3294+
.prefix(prefix)
3295+
.map(map)
3296+
.opcode(opcode)
3297+
.rm(src2)
3298+
.reg(dst.to_real_reg().unwrap().hw_enc())
3299+
.vvvv(src1.to_real_reg().unwrap().hw_enc())
3300+
.encode(sink);
3301+
}
3302+
32423303
Inst::CvtUint64ToFloatSeq {
32433304
dst_size,
32443305
src,

cranelift/codegen/src/isa/x64/inst/emit_tests.rs

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5051,26 +5051,6 @@ fn test_x64_emit() {
50515051
"664C0F6EFF",
50525052
"movq %rdi, %xmm15",
50535053
));
5054-
insns.push((
5055-
Inst::gpr_to_xmm(
5056-
SseOpcode::Cvtsi2ss,
5057-
RegMem::reg(rdi),
5058-
OperandSize::Size32,
5059-
w_xmm15,
5060-
),
5061-
"F3440F2AFF",
5062-
"cvtsi2ss %edi, %xmm15",
5063-
));
5064-
insns.push((
5065-
Inst::gpr_to_xmm(
5066-
SseOpcode::Cvtsi2sd,
5067-
RegMem::reg(rsi),
5068-
OperandSize::Size64,
5069-
w_xmm1,
5070-
),
5071-
"F2480F2ACE",
5072-
"cvtsi2sd %rsi, %xmm1",
5073-
));
50745054

50755055
// ========================================================
50765056
// XmmRmi

cranelift/codegen/src/isa/x64/inst/mod.rs

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,8 @@ impl Inst {
178178
| Inst::XmmToGprImm { op, .. }
179179
| Inst::XmmUnaryRmRImm { op, .. }
180180
| Inst::XmmUnaryRmRUnaligned { op, .. }
181-
| Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
181+
| Inst::XmmUnaryRmR { op, .. }
182+
| Inst::CvtIntToFloat { op, .. } => smallvec![op.available_from()],
182183

183184
Inst::XmmUnaryRmREvex { op, .. }
184185
| Inst::XmmRmREvex { op, .. }
@@ -196,7 +197,8 @@ impl Inst {
196197
| Inst::XmmMovRMImmVex { op, .. }
197198
| Inst::XmmToGprImmVex { op, .. }
198199
| Inst::XmmToGprVex { op, .. }
199-
| Inst::GprToXmmVex { op, .. } => op.available_from(),
200+
| Inst::GprToXmmVex { op, .. }
201+
| Inst::CvtIntToFloatVex { op, .. } => op.available_from(),
200202
}
201203
}
202204
}
@@ -1296,6 +1298,34 @@ impl PrettyPrint for Inst {
12961298
format!("{op} {src}, {dst}")
12971299
}
12981300

1301+
Inst::CvtIntToFloat {
1302+
op,
1303+
src1,
1304+
src2,
1305+
dst,
1306+
src2_size,
1307+
} => {
1308+
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
1309+
let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
1310+
let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
1311+
let op = ljustify(op.to_string());
1312+
format!("{op} {src1}, {src2}, {dst}")
1313+
}
1314+
1315+
Inst::CvtIntToFloatVex {
1316+
op,
1317+
src1,
1318+
src2,
1319+
dst,
1320+
src2_size,
1321+
} => {
1322+
let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
1323+
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
1324+
let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
1325+
let op = ljustify(op.to_string());
1326+
format!("{op} {src1}, {src2}, {dst}")
1327+
}
1328+
12991329
Inst::CvtUint64ToFloatSeq {
13001330
src,
13011331
dst,
@@ -2164,6 +2194,20 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
21642194
collector.reg_def(dst.to_writable_reg());
21652195
src.get_operands(collector);
21662196
}
2197+
Inst::CvtIntToFloat {
2198+
src1, src2, dst, ..
2199+
} => {
2200+
collector.reg_use(src1.to_reg());
2201+
collector.reg_reuse_def(dst.to_writable_reg(), 0);
2202+
src2.get_operands(collector);
2203+
}
2204+
Inst::CvtIntToFloatVex {
2205+
src1, src2, dst, ..
2206+
} => {
2207+
collector.reg_def(dst.to_writable_reg());
2208+
collector.reg_use(src1.to_reg());
2209+
src2.get_operands(collector);
2210+
}
21672211
Inst::CvtUint64ToFloatSeq {
21682212
src,
21692213
dst,

cranelift/codegen/src/isa/x64/lower.isle

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3336,23 +3336,40 @@
33363336

33373337
;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33383338

3339+
;; Note that the `cvtsi2s{s,d}` instruction is not just an int-to-float
3340+
;; conversion instruction in isolation, it also takes the upper 64-bits of an
3341+
;; xmm register and places it into the destination. We don't actually want that
3342+
;; to happen as it could accidentally create a false dependency with a
3343+
;; previous instruction defining the register's upper 64-bits. See #7085 for
3344+
;; an instance of this.
3345+
;;
3346+
;; This means that the first operand to all of the int-to-float conversions here
3347+
;; are `(xmm_zero)` operands which is a guaranteed zero register that has no
3348+
;; dependencies on other instructions.
3349+
;;
3350+
;; Ideally this would be lifted out to a higher level to get deduplicated
3351+
;; between consecutive int-to-float operations but that's not easy
3352+
;; to do at this time. One possibility would be a mid-end rule which rewrites
3353+
;; `fcvt_from_sint` to an x86-specific opcode using a zero constant which would
3354+
;; be subject to normal LICM, but that's not feasible today.
3355+
33393356
(rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8))))
3340-
(x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
3357+
(x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))
33413358

33423359
(rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16))))
3343-
(x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
3360+
(x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))
33443361

33453362
(rule 1 (lower (has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
3346-
(x64_cvtsi2ss ty a))
3363+
(x64_cvtsi2ss ty (xmm_zero $F32X4) a))
33473364

33483365
(rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8))))
3349-
(x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
3366+
(x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))
33503367

33513368
(rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16))))
3352-
(x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
3369+
(x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))
33533370

33543371
(rule 1 (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
3355-
(x64_cvtsi2sd ty a))
3372+
(x64_cvtsi2sd ty (xmm_zero $F64X2) a))
33563373

33573374
(rule 0 (lower (fcvt_from_sint a @ (value_type $I32X4)))
33583375
(x64_cvtdq2ps a))
@@ -3363,10 +3380,10 @@
33633380
;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33643381

33653382
(rule 1 (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
3366-
(x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
3383+
(x64_cvtsi2ss $I64 (xmm_zero $F32X4) (extend_to_gpr val $I64 (ExtendKind.Zero))))
33673384

33683385
(rule 1 (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
3369-
(x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
3386+
(x64_cvtsi2sd $I64 (xmm_zero $F64X2) (extend_to_gpr val $I64 (ExtendKind.Zero))))
33703387

33713388
(rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
33723389
(cvt_u64_to_float_seq ty val))

0 commit comments

Comments
 (0)