geekbeast
diff --git a/‎cranelift/codegen/src/isa/x64/inst.isle‎
Lines changed: 39 additions & 10 deletions b/‎cranelift/codegen/src/isa/x64/inst.isle‎
Lines changed: 39 additions & 10 deletions
diff --git a/‎cranelift/codegen/src/isa/x64/inst/emit.rs‎
Lines changed: 77 additions & 16 deletions b/‎cranelift/codegen/src/isa/x64/inst/emit.rs‎
Lines changed: 77 additions & 16 deletions
diff --git a/‎cranelift/codegen/src/isa/x64/inst/emit_tests.rs‎
Lines changed: 0 additions & 20 deletions b/‎cranelift/codegen/src/isa/x64/inst/emit_tests.rs‎
Lines changed: 0 additions & 20 deletions
diff --git a/‎cranelift/codegen/src/isa/x64/inst/mod.rs‎
Lines changed: 46 additions & 2 deletions b/‎cranelift/codegen/src/isa/x64/inst/mod.rs‎
Lines changed: 46 additions & 2 deletions
diff --git a/‎cranelift/codegen/src/isa/x64/lower.isle‎
Lines changed: 25 additions & 8 deletions b/‎cranelift/codegen/src/isa/x64/lower.isle‎
Lines changed: 25 additions & 8 deletions
@@ -437,6 +437,23 @@
                  (dst WritableXmm)
                  (src_size OperandSize))
 
+       ;; Conversion from signed integers to floats, the `{v,}`cvtsi2s{s,d}`
+       ;; instructions.
+       ;;
+       ;; Note that this is special in that `src1` is an xmm/float register
+       ;; while `src2` is a general purpose register as this is converting an
+       ;; integer in a gpr to an equivalent float in an xmm reg.
+       (CvtIntToFloat (op SseOpcode)
+                      (src1 Xmm)
+                      (src2 GprMem)
+                      (dst WritableXmm)
+                      (src2_size OperandSize))
+       (CvtIntToFloatVex (op AvxOpcode)
+                         (src1 Xmm)
+                         (src2 GprMem)
+                         (dst WritableXmm)
+                         (src2_size OperandSize))
+
        ;; Converts an unsigned int64 to a float32/float64.
        (CvtUint64ToFloatSeq (dst_size OperandSize) ;; 4 or 8
                             (src Gpr)
@@ -2095,6 +2112,18 @@
             (_ Unit (emit (MInst.UnaryRmRImmVex size op src dst imm))))
         dst))
 
+(decl cvt_int_to_float (SseOpcode Xmm GprMem OperandSize) Xmm)
+(rule (cvt_int_to_float op src1 src2 size)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.CvtIntToFloat op src1 src2 dst size))))
+        dst))
+
+(decl cvt_int_to_float_vex (AvxOpcode Xmm GprMem OperandSize) Xmm)
+(rule (cvt_int_to_float_vex op src1 src2 size)
+      (let ((dst WritableXmm (temp_writable_xmm))
+            (_ Unit (emit (MInst.CvtIntToFloatVex op src1 src2 dst size))))
+        dst))
+
 (decl cvt_u64_to_float_seq (Type Gpr) Xmm)
 (rule (cvt_u64_to_float_seq ty src)
       (let ((size OperandSize (raw_operand_size_of_type ty))
@@ -4351,20 +4380,20 @@
         (xmm_unary_rm_r_vex (AvxOpcode.Vcvtdq2pd) x))
 
 ;; Helper for creating `cvtsi2ss` instructions.
-(decl x64_cvtsi2ss (Type GprMem) Xmm)
-(rule (x64_cvtsi2ss ty x)
-      (gpr_to_xmm (SseOpcode.Cvtsi2ss) x (raw_operand_size_of_type ty)))
-(rule 1 (x64_cvtsi2ss ty x)
+(decl x64_cvtsi2ss (Type Xmm GprMem) Xmm)
+(rule (x64_cvtsi2ss ty x y)
+      (cvt_int_to_float (SseOpcode.Cvtsi2ss) x y (raw_operand_size_of_type ty)))
+(rule 1 (x64_cvtsi2ss ty x y)
         (if-let $true (use_avx))
-        (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2ss) x (raw_operand_size_of_type ty)))
+        (cvt_int_to_float_vex (AvxOpcode.Vcvtsi2ss) x y (raw_operand_size_of_type ty)))
 
 ;; Helper for creating `cvtsi2sd` instructions.
-(decl x64_cvtsi2sd (Type GprMem) Xmm)
-(rule (x64_cvtsi2sd ty x)
-      (gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty)))
-(rule 1 (x64_cvtsi2sd ty x)
+(decl x64_cvtsi2sd (Type Xmm GprMem) Xmm)
+(rule (x64_cvtsi2sd ty x y)
+      (cvt_int_to_float (SseOpcode.Cvtsi2sd) x y (raw_operand_size_of_type ty)))
+(rule 1 (x64_cvtsi2sd ty x y)
         (if-let $true (use_avx))
-        (gpr_to_xmm_vex (AvxOpcode.Vcvtsi2sd) x (raw_operand_size_of_type ty)))
+        (cvt_int_to_float_vex (AvxOpcode.Vcvtsi2sd) x y (raw_operand_size_of_type ty)))
 
 ;; Helper for creating `cvttps2dq` instructions.
 (decl x64_cvttps2dq (XmmMem) Xmm)
 
@@ -32,8 +32,14 @@ fn emit_signed_cvt(
     } else {
         SseOpcode::Cvtsi2ss
     };
-    let inst = Inst::gpr_to_xmm(op, RegMem::reg(src), OperandSize::Size64, dst);
-    inst.emit(&[], sink, info, state);
+    Inst::CvtIntToFloat {
+        op,
+        dst: Writable::from_reg(Xmm::new(dst.to_reg()).unwrap()),
+        src1: Xmm::new(dst.to_reg()).unwrap(),
+        src2: GprMem::new(RegMem::reg(src)).unwrap(),
+        src2_size: OperandSize::Size64,
+    }
+    .emit(&[], sink, info, state);
 }
 
 /// Emits a one way conditional jump if CC is set (true).
@@ -2872,30 +2878,21 @@ pub(crate) fn emit(
             let (prefix, map, opcode) = match op {
                 // vmovd/vmovq are differentiated by `w`
                 AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E),
-                AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
-                AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
             let w = match src_size {
                 OperandSize::Size64 => true,
                 _ => false,
             };
-            let mut insn = VexInstruction::new()
+            VexInstruction::new()
                 .length(VexVectorLength::V128)
                 .w(w)
                 .prefix(prefix)
                 .map(map)
                 .opcode(opcode)
                 .rm(src)
-                .reg(dst.to_real_reg().unwrap().hw_enc());
-            // These opcodes technically take a second operand which is the
-            // upper bits to preserve during the float conversion. We don't
-            // actually use this in this backend right now so reuse the
-            // destination register. This at least matches what LLVM does.
-            if let AvxOpcode::Vcvtsi2ss | AvxOpcode::Vcvtsi2sd = op {
-                insn = insn.vvvv(dst.to_real_reg().unwrap().hw_enc());
-            }
-            insn.encode(sink);
+                .reg(dst.to_real_reg().unwrap().hw_enc())
+                .encode(sink);
         }
 
         Inst::XmmRmREvex {
@@ -3200,8 +3197,6 @@ pub(crate) fn emit(
                 // Movd and movq use the same opcode; the presence of the REX prefix (set below)
                 // actually determines which is used.
                 SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
-                SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
-                SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
                 _ => panic!("unexpected opcode {:?}", op),
             };
             let rex = RexFlags::from(*src_size);
@@ -3239,6 +3234,72 @@ pub(crate) fn emit(
             }
         }
 
+        Inst::CvtIntToFloat {
+            op,
+            src1,
+            src2,
+            dst,
+            src2_size,
+        } => {
+            let src1 = allocs.next(src1.to_reg());
+            let dst = allocs.next(dst.to_reg().to_reg());
+            assert_eq!(src1, dst);
+            let src2 = src2.clone().to_reg_mem().with_allocs(allocs);
+
+            let (prefix, opcode) = match op {
+                SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
+                SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
+                _ => panic!("unexpected opcode {:?}", op),
+            };
+            let rex = RexFlags::from(*src2_size);
+            match src2 {
+                RegMem::Reg { reg: src2 } => {
+                    emit_std_reg_reg(sink, prefix, opcode, 2, dst, src2, rex);
+                }
+                RegMem::Mem { addr } => {
+                    let addr = &addr.finalize(state, sink);
+                    emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex, 0);
+                }
+            }
+        }
+
+        Inst::CvtIntToFloatVex {
+            op,
+            src1,
+            src2,
+            dst,
+            src2_size,
+        } => {
+            let dst = allocs.next(dst.to_reg().to_reg());
+            let src1 = allocs.next(src1.to_reg());
+            let src2 = match src2.clone().to_reg_mem().with_allocs(allocs) {
+                RegMem::Reg { reg } => {
+                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
+                }
+                RegMem::Mem { addr } => RegisterOrAmode::Amode(addr.finalize(state, sink)),
+            };
+
+            let (prefix, map, opcode) = match op {
+                AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
+                AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
+                _ => unimplemented!("Opcode {:?} not implemented", op),
+            };
+            let w = match src2_size {
+                OperandSize::Size64 => true,
+                _ => false,
+            };
+            VexInstruction::new()
+                .length(VexVectorLength::V128)
+                .w(w)
+                .prefix(prefix)
+                .map(map)
+                .opcode(opcode)
+                .rm(src2)
+                .reg(dst.to_real_reg().unwrap().hw_enc())
+                .vvvv(src1.to_real_reg().unwrap().hw_enc())
+                .encode(sink);
+        }
+
         Inst::CvtUint64ToFloatSeq {
             dst_size,
             src,
 
@@ -5051,26 +5051,6 @@ fn test_x64_emit() {
         "664C0F6EFF",
         "movq    %rdi, %xmm15",
     ));
-    insns.push((
-        Inst::gpr_to_xmm(
-            SseOpcode::Cvtsi2ss,
-            RegMem::reg(rdi),
-            OperandSize::Size32,
-            w_xmm15,
-        ),
-        "F3440F2AFF",
-        "cvtsi2ss %edi, %xmm15",
-    ));
-    insns.push((
-        Inst::gpr_to_xmm(
-            SseOpcode::Cvtsi2sd,
-            RegMem::reg(rsi),
-            OperandSize::Size64,
-            w_xmm1,
-        ),
-        "F2480F2ACE",
-        "cvtsi2sd %rsi, %xmm1",
-    ));
 
     // ========================================================
     // XmmRmi
 
@@ -178,7 +178,8 @@ impl Inst {
             | Inst::XmmToGprImm { op, .. }
             | Inst::XmmUnaryRmRImm { op, .. }
             | Inst::XmmUnaryRmRUnaligned { op, .. }
-            | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
+            | Inst::XmmUnaryRmR { op, .. }
+            | Inst::CvtIntToFloat { op, .. } => smallvec![op.available_from()],
 
             Inst::XmmUnaryRmREvex { op, .. }
             | Inst::XmmRmREvex { op, .. }
@@ -196,7 +197,8 @@ impl Inst {
             | Inst::XmmMovRMImmVex { op, .. }
             | Inst::XmmToGprImmVex { op, .. }
             | Inst::XmmToGprVex { op, .. }
-            | Inst::GprToXmmVex { op, .. } => op.available_from(),
+            | Inst::GprToXmmVex { op, .. }
+            | Inst::CvtIntToFloatVex { op, .. } => op.available_from(),
         }
     }
 }
@@ -1296,6 +1298,34 @@ impl PrettyPrint for Inst {
                 format!("{op} {src}, {dst}")
             }
 
+            Inst::CvtIntToFloat {
+                op,
+                src1,
+                src2,
+                dst,
+                src2_size,
+            } => {
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
+                let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
+                let op = ljustify(op.to_string());
+                format!("{op} {src1}, {src2}, {dst}")
+            }
+
+            Inst::CvtIntToFloatVex {
+                op,
+                src1,
+                src2,
+                dst,
+                src2_size,
+            } => {
+                let dst = pretty_print_reg(*dst.to_reg(), 8, allocs);
+                let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
+                let src2 = src2.pretty_print(src2_size.to_bytes(), allocs);
+                let op = ljustify(op.to_string());
+                format!("{op} {src1}, {src2}, {dst}")
+            }
+
             Inst::CvtUint64ToFloatSeq {
                 src,
                 dst,
@@ -2164,6 +2194,20 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
             collector.reg_def(dst.to_writable_reg());
             src.get_operands(collector);
         }
+        Inst::CvtIntToFloat {
+            src1, src2, dst, ..
+        } => {
+            collector.reg_use(src1.to_reg());
+            collector.reg_reuse_def(dst.to_writable_reg(), 0);
+            src2.get_operands(collector);
+        }
+        Inst::CvtIntToFloatVex {
+            src1, src2, dst, ..
+        } => {
+            collector.reg_def(dst.to_writable_reg());
+            collector.reg_use(src1.to_reg());
+            src2.get_operands(collector);
+        }
         Inst::CvtUint64ToFloatSeq {
             src,
             dst,
 
@@ -3336,23 +3336,40 @@
 
 ;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; Note that the `cvtsi2s{s,d}` instruction is not just an int-to-float
+;; conversion instruction in isolation, it also takes the upper 64-bits of an
+;; xmm register and places it into the destination. We don't actually want that
+;; to happen as it could accidentally create a false dependency with a
+;; previous instruction defining the register's upper 64-bits. See #7085 for
+;; an instance of this.
+;;
+;; This means that the first operand to all of the int-to-float conversions here
+;; are `(xmm_zero)` operands which is a guaranteed zero register that has no
+;; dependencies on other instructions.
+;;
+;; Ideally this would be lifted out to a higher level to get deduplicated
+;; between consecutive int-to-float operations but that's not easy
+;; to do at this time. One possibility would be a mid-end rule which rewrites
+;; `fcvt_from_sint` to an x86-specific opcode using a zero constant which would
+;; be subject to normal LICM, but that's not feasible today.
+
 (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I8))))
-      (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 2 (lower (has_type $F32 (fcvt_from_sint a @ (value_type $I16))))
-      (x64_cvtsi2ss $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 1 (lower (has_type $F32 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
-      (x64_cvtsi2ss ty a))
+      (x64_cvtsi2ss ty (xmm_zero $F32X4) a))
 
 (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I8))))
-      (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 2 (lower (has_type $F64 (fcvt_from_sint a @ (value_type $I16))))
-      (x64_cvtsi2sd $I32 (extend_to_gpr a $I32 (ExtendKind.Sign))))
+      (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign))))
 
 (rule 1 (lower (has_type $F64 (fcvt_from_sint a @ (value_type (ty_int (fits_in_64 ty))))))
-      (x64_cvtsi2sd ty a))
+      (x64_cvtsi2sd ty (xmm_zero $F64X2) a))
 
 (rule 0 (lower (fcvt_from_sint a @ (value_type $I32X4)))
       (x64_cvtdq2ps a))
@@ -3363,10 +3380,10 @@
 ;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 1 (lower (has_type $F32 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
-      (x64_cvtsi2ss $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+      (x64_cvtsi2ss $I64 (xmm_zero $F32X4) (extend_to_gpr val $I64 (ExtendKind.Zero))))
 
 (rule 1 (lower (has_type $F64 (fcvt_from_uint val @ (value_type (fits_in_32 (ty_int ty))))))
-      (x64_cvtsi2sd $I64 (extend_to_gpr val $I64 (ExtendKind.Zero))))
+      (x64_cvtsi2sd $I64 (xmm_zero $F64X2) (extend_to_gpr val $I64 (ExtendKind.Zero))))
 
 (rule (lower (has_type ty (fcvt_from_uint val @ (value_type $I64))))
       (cvt_u64_to_float_seq ty val))