Implement wide-arithmetic for Winch

alexcrichton · alexcrichton · commit b5054fdacce0 · 2024-10-22T15:55:06.000-07:00
This commit implements the wide-arithmetic proposal for Winch on x64.
This is mostly for me to get my feet wet doing things in Winch. The
proposal itself is relatively modest with just four new instructions.
diff --git a/crates/fuzzing/src/generators/config.rs b/crates/fuzzing/src/generators/config.rs
@@ -487,7 +487,6 @@ impl WasmtimeConfig {
             config.threads_enabled = false;
             config.tail_call_enabled = false;
             config.reference_types_enabled = false;
-            config.wide_arithmetic_enabled = false;
 
             // Tuning  the following engine options is currently not supported
             // by Winch.
diff --git a/tests/wast.rs b/tests/wast.rs
@@ -174,8 +174,6 @@ fn should_fail(test: &Path, strategy: Strategy) -> bool {
             "spec_testsuite/simd_store32_lane.wast",
             "spec_testsuite/simd_store64_lane.wast",
             "spec_testsuite/simd_store8_lane.wast",
-            // wide arithmetic
-            "misc_testsuite/wide-arithmetic.wast",
         ];
 
         if unsupported.iter().any(|part| test.ends_with(part)) {
diff --git a/winch/codegen/src/codegen/context.rs b/winch/codegen/src/codegen/context.rs
@@ -590,4 +590,22 @@ impl<'a> CodeGenContext<'a> {
             _ => {}
         });
     }
+
+    /// Prepares for emitting a binary operation where four 64-bit operands are
+    /// used to produce two 64-bit results, e.g. a 128-bit binop.
+    pub fn binop128<F, M>(&mut self, masm: &mut M, emit: F)
+    where
+        F: FnOnce(&mut M, Reg, Reg, Reg, Reg) -> (TypedReg, TypedReg),
+        M: MacroAssembler,
+    {
+        let rhs_hi = self.pop_to_reg(masm, None);
+        let rhs_lo = self.pop_to_reg(masm, None);
+        let lhs_hi = self.pop_to_reg(masm, None);
+        let lhs_lo = self.pop_to_reg(masm, None);
+        let (lo, hi) = emit(masm, lhs_lo.reg, lhs_hi.reg, rhs_lo.reg, rhs_hi.reg);
+        self.free_reg(rhs_hi);
+        self.free_reg(rhs_lo);
+        self.stack.push(lo.into());
+        self.stack.push(hi.into());
+    }
 }
diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs
@@ -5,8 +5,8 @@ use crate::{
     isa::reg::{writable, Reg, WritableReg},
     masm::{
         CalleeKind, DivKind, ExtendKind, FloatCmpKind, Imm as I, IntCmpKind,
-        MacroAssembler as Masm, OperandSize, RegImm, RemKind, RoundingMode, SPOffset, ShiftKind,
-        StackSlot, TrapCode, TruncKind,
+        MacroAssembler as Masm, MulWideKind, OperandSize, RegImm, RemKind, RoundingMode, SPOffset,
+        ShiftKind, StackSlot, TrapCode, TruncKind,
     },
 };
 use cranelift_codegen::{
@@ -673,6 +673,37 @@ impl Masm for MacroAssembler {
     fn current_code_offset(&self) -> CodeOffset {
         self.asm.buffer().cur_offset()
     }
+
+    fn add128(
+        &mut self,
+        dst_lo: WritableReg,
+        dst_hi: WritableReg,
+        lhs_lo: Reg,
+        lhs_hi: Reg,
+        rhs_lo: Reg,
+        rhs_hi: Reg,
+    ) {
+        let _ = (dst_lo, dst_hi, lhs_lo, lhs_hi, rhs_lo, rhs_hi);
+        todo!()
+    }
+
+    fn sub128(
+        &mut self,
+        dst_lo: WritableReg,
+        dst_hi: WritableReg,
+        lhs_lo: Reg,
+        lhs_hi: Reg,
+        rhs_lo: Reg,
+        rhs_hi: Reg,
+    ) {
+        let _ = (dst_lo, dst_hi, lhs_lo, lhs_hi, rhs_lo, rhs_hi);
+        todo!()
+    }
+
+    fn mul_wide(&mut self, context: &mut CodeGenContext, kind: MulWideKind) {
+        let _ = (context, kind);
+        todo!()
+    }
 }
 
 impl MacroAssembler {
diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs
@@ -2,7 +2,9 @@
 
 use crate::{
     isa::reg::Reg,
-    masm::{DivKind, ExtendKind, IntCmpKind, OperandSize, RemKind, RoundingMode, ShiftKind},
+    masm::{
+        DivKind, ExtendKind, IntCmpKind, MulWideKind, OperandSize, RemKind, RoundingMode, ShiftKind,
+    },
 };
 use cranelift_codegen::{
     ir::{
@@ -1363,6 +1365,45 @@ impl Assembler {
             size: size.into(),
         });
     }
+
+    pub fn adc_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {
+        self.emit(Inst::AluRmiR {
+            size: size.into(),
+            op: AluRmiROpcode::Adc,
+            src1: dst.to_reg().into(),
+            src2: src.into(),
+            dst: dst.map(Into::into),
+        });
+    }
+
+    pub fn sbb_rr(&mut self, src: Reg, dst: WritableReg, size: OperandSize) {
+        self.emit(Inst::AluRmiR {
+            size: size.into(),
+            op: AluRmiROpcode::Sbb,
+            src1: dst.to_reg().into(),
+            src2: src.into(),
+            dst: dst.map(Into::into),
+        });
+    }
+
+    pub fn mul_wide(
+        &mut self,
+        dst_lo: WritableReg,
+        dst_hi: WritableReg,
+        lhs: Reg,
+        rhs: Reg,
+        kind: MulWideKind,
+        size: OperandSize,
+    ) {
+        self.emit(Inst::Mul {
+            signed: kind == MulWideKind::Signed,
+            size: size.into(),
+            src1: lhs.into(),
+            src2: rhs.into(),
+            dst_lo: dst_lo.to_reg().into(),
+            dst_hi: dst_hi.to_reg().into(),
+        });
+    }
 }
 
 /// Captures the region in a MachBuffer where an add-with-immediate instruction would be emitted,
diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs
@@ -6,13 +6,14 @@ use super::{
 };
 
 use crate::masm::{
-    DivKind, ExtendKind, FloatCmpKind, Imm as I, IntCmpKind, MacroAssembler as Masm, OperandSize,
-    RegImm, RemKind, RoundingMode, ShiftKind, TrapCode, TruncKind, TRUSTED_FLAGS, UNTRUSTED_FLAGS,
+    DivKind, ExtendKind, FloatCmpKind, Imm as I, IntCmpKind, MacroAssembler as Masm, MulWideKind,
+    OperandSize, RegImm, RemKind, RoundingMode, ShiftKind, TrapCode, TruncKind, TRUSTED_FLAGS,
+    UNTRUSTED_FLAGS,
 };
 use crate::{
     abi::{self, align_to, calculate_frame_adjustment, LocalSlot},
     codegen::{ptr_type_from_ptr_size, CodeGenContext, FuncEnv},
-    stack::Val,
+    stack::{TypedReg, Val},
 };
 use crate::{
     abi::{vmctx, ABI},
@@ -996,6 +997,68 @@ impl Masm for MacroAssembler {
     fn current_code_offset(&self) -> CodeOffset {
         self.asm.buffer().cur_offset()
     }
+
+    fn add128(
+        &mut self,
+        dst_lo: WritableReg,
+        dst_hi: WritableReg,
+        lhs_lo: Reg,
+        lhs_hi: Reg,
+        rhs_lo: Reg,
+        rhs_hi: Reg,
+    ) {
+        Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo);
+        Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi);
+        self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);
+        self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);
+    }
+
+    fn sub128(
+        &mut self,
+        dst_lo: WritableReg,
+        dst_hi: WritableReg,
+        lhs_lo: Reg,
+        lhs_hi: Reg,
+        rhs_lo: Reg,
+        rhs_hi: Reg,
+    ) {
+        Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo);
+        Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi);
+        self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);
+        self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);
+    }
+
+    fn mul_wide(&mut self, context: &mut CodeGenContext, kind: MulWideKind) {
+        // Reserve rax/rdx since they're required by the `mul_wide` instruction
+        // being used here.
+        let rax = context.reg(regs::rax(), self);
+        let rdx = context.reg(regs::rdx(), self);
+
+        // The rhs of this binop can be in any register
+        let rhs = context.pop_to_reg(self, None);
+        // Mark rax as allocatable again, and then force the lhs operand to be
+        // placed in `rax`.
+        context.free_reg(rax);
+        let lhs = context.pop_to_reg(self, Some(rax));
+
+        self.asm.mul_wide(
+            writable!(rax),
+            writable!(rdx),
+            lhs.reg,
+            rhs.reg,
+            kind,
+            OperandSize::S64,
+        );
+
+        // No longer using the rhs register after the multiplication has been
+        // executed.
+        context.free_reg(rhs);
+
+        // The low bits of the result are in rax, which is where `lhs` was allocated.
+        context.stack.push(lhs.into());
+        // The high bits of the result are in rdx, which we previously reserved.
+        context.stack.push(Val::Reg(TypedReg::i64(rdx)));
+    }
 }
 
 impl MacroAssembler {
diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs
@@ -27,6 +27,12 @@ pub(crate) enum RemKind {
     Unsigned,
 }
 
+#[derive(Eq, PartialEq)]
+pub(crate) enum MulWideKind {
+    Signed,
+    Unsigned,
+}
+
 /// The direction to perform the memory move.
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub(crate) enum MemMoveDirection {
@@ -1019,4 +1025,33 @@ pub(crate) trait MacroAssembler {
 
     /// The current offset, in bytes from the beginning of the function.
     fn current_code_offset(&self) -> CodeOffset;
+
+    /// Performs a 128-bit addition of `lhs` and `rhs`, each split into lo/hi halves.
+    fn add128(
+        &mut self,
+        dst_lo: WritableReg,
+        dst_hi: WritableReg,
+        lhs_lo: Reg,
+        lhs_hi: Reg,
+        rhs_lo: Reg,
+        rhs_hi: Reg,
+    );
+
+    /// Performs a 128-bit subtraction of `rhs` from `lhs`, each split into lo/hi halves.
+    fn sub128(
+        &mut self,
+        dst_lo: WritableReg,
+        dst_hi: WritableReg,
+        lhs_lo: Reg,
+        lhs_hi: Reg,
+        rhs_lo: Reg,
+        rhs_hi: Reg,
+    );
+
+    /// Performs a widening multiplication from two 64-bit operands into a
+    /// 128-bit result.
+    ///
+    /// Note that some platforms require special handling of registers in this
+    /// instruction (e.g. x64) so full access to `CodeGenContext` is provided.
+    fn mul_wide(&mut self, context: &mut CodeGenContext, kind: MulWideKind);
 }
diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs
@@ -7,8 +7,8 @@
 use crate::abi::RetArea;
 use crate::codegen::{control_index, Callee, CodeGen, ControlStackFrame, FnCall};
 use crate::masm::{
-    DivKind, ExtendKind, FloatCmpKind, IntCmpKind, MacroAssembler, MemMoveDirection, OperandSize,
-    RegImm, RemKind, RoundingMode, SPOffset, ShiftKind, TruncKind,
+    DivKind, ExtendKind, FloatCmpKind, IntCmpKind, MacroAssembler, MemMoveDirection, MulWideKind,
+    OperandSize, RegImm, RemKind, RoundingMode, SPOffset, ShiftKind, TruncKind,
 };
 use crate::reg::{writable, Reg};
 use crate::stack::{TypedReg, Val};
@@ -243,6 +243,10 @@ macro_rules! def_unsupported {
     (emit I64TruncSatF64U $($rest:tt)*) => {};
     (emit V128Load $($rest:tt)*) => {};
     (emit V128Store $($rest:tt)*) => {};
+    (emit I64Add128 $($rest:tt)*) => {};
+    (emit I64Sub128 $($rest:tt)*) => {};
+    (emit I64MulWideS $($rest:tt)*) => {};
+    (emit I64MulWideU $($rest:tt)*) => {};
 
     (emit $unsupported:tt $($rest:tt)*) => {$($rest)*};
 }
@@ -2188,6 +2192,44 @@ where
         );
     }
 
+    fn visit_i64_add128(&mut self) {
+        self.context
+            .binop128(self.masm, |masm, lhs_lo, lhs_hi, rhs_lo, rhs_hi| {
+                masm.add128(
+                    writable!(lhs_lo),
+                    writable!(lhs_hi),
+                    lhs_lo,
+                    lhs_hi,
+                    rhs_lo,
+                    rhs_hi,
+                );
+                (TypedReg::i64(lhs_lo), TypedReg::i64(lhs_hi))
+            });
+    }
+
+    fn visit_i64_sub128(&mut self) {
+        self.context
+            .binop128(self.masm, |masm, lhs_lo, lhs_hi, rhs_lo, rhs_hi| {
+                masm.sub128(
+                    writable!(lhs_lo),
+                    writable!(lhs_hi),
+                    lhs_lo,
+                    lhs_hi,
+                    rhs_lo,
+                    rhs_hi,
+                );
+                (TypedReg::i64(lhs_lo), TypedReg::i64(lhs_hi))
+            });
+    }
+
+    fn visit_i64_mul_wide_s(&mut self) {
+        self.masm.mul_wide(&mut self.context, MulWideKind::Signed);
+    }
+
+    fn visit_i64_mul_wide_u(&mut self) {
+        self.masm.mul_wide(&mut self.context, MulWideKind::Unsigned);
+    }
+
     wasmparser::for_each_operator!(def_unsupported);
 }