diff --git a/src/vhdl/divider32.vhdl b/src/vhdl/divider32.vhdl
index c00189422..9119e91a7 100644
--- a/src/vhdl/divider32.vhdl
+++ b/src/vhdl/divider32.vhdl
@@ -28,33 +28,209 @@ use Std.TextIO.all;
 use work.debugtools.all;
   
 entity divider32 is
+  generic (
+    unit : integer range 0 to 15  -- index of this unit; the visible code uses it only in report messages
+    );
   port (
     clock : in std_logic;
-    unit : in integer range 0 to 15;
     do_add : in std_logic;
+    invert_b : in std_logic;  -- '1': operand b is loaded as the two's-complement negation of input_value
+    do_mult : in std_logic;  -- '1' (with do_add = '0'): output_value carries the a * b product instead of the quotient
     input_a : in integer range 0 to 15;
     input_b : in integer range 0 to 15;
     input_value_number : in integer range 0 to 15;
     input_value : unsigned(31 downto 0);
-    output_select : in integer range 0 to 15;
-    output_value : out unsigned(63 downto 0)
+    -- output_select : in integer range 0 to 15;
+    mult_shift : in unsigned(2 downto 0);  -- product is shifted right by mult_shift * 8 bits before being output
+    output_value : out unsigned(63 downto 0) := (others => '0')  -- always driven: sum, shifted product or quotient
     );
 end entity;
 
 architecture neo_gregorian of divider32 is
 
-  signal a : signed(31 downto 0) := to_signed(0,32);
-  signal b : signed(31 downto 0) := to_signed(0,32);
-  signal p : signed(63 downto 0) := to_signed(0,64);
+  signal a : unsigned(31 downto 0) := to_unsigned(0,32);
+  signal b : unsigned(31 downto 0) := to_unsigned(0,32);
+  signal p : unsigned(63 downto 0) := to_unsigned(0,64);
+  signal q : unsigned(63 downto 0) := to_unsigned(0,64);
   signal s : unsigned(32 downto 0) := to_unsigned(0,33);
-  
-  signal p1 : signed(63 downto 0);
-  signal p2 : signed(63 downto 0);
-  signal p3 : signed(63 downto 0);
-  signal p4 : signed(63 downto 0);
 
+  signal busy : std_logic := '0';
+  signal start_over : std_logic := '0';
+
+  type state_t is (idle, start_1, start_2, start_3, step_1, step_2, output);
+  signal state : state_t := idle;
+  signal steps_remaining : integer range 0 to 5 := 0;
+
+  signal mult_a : unsigned(67 downto 0) := (others => '0');
+  signal mult_b : unsigned(69 downto 0) := (others => '0');
+  signal mult_signed : std_logic := '0';
+  signal mult_out : unsigned(137 downto 0) := (others => '0');
+
+  signal dd : unsigned(67 downto 0) := to_unsigned(0,68);
+  signal nn : unsigned(67 downto 0) := to_unsigned(0,68);
+
+  pure function count_leading_zeros(arg : unsigned(31 downto 0)) return natural is  -- number of '0' bits above the highest '1' in arg
+  begin
+    for bit_index in 31 downto 0 loop  -- scan from the most significant bit downwards
+      if arg(bit_index) = '1' then
+        return 31 - bit_index;  -- distance of the first set bit from bit 31
+      end if;
+    end loop;
+    return 0;  -- all-zero argument yields 0, not 32
+  end function count_leading_zeros;
 begin
 
+  process (clock) is  -- Divider state machine: produces q = a / b (32.32 fixed point) and p = a * b
+    variable temp64 : unsigned(73 downto 0) := to_unsigned(0,74);  -- rounding scratch for the final quotient (74 bits wide despite the name)
+    variable temp96 : unsigned(105 downto 0) := to_unsigned(0,106);  -- not referenced anywhere in this process
+    -- variable temp138 : unsigned(137 downto 0) := to_unsigned(0,138);
+    variable f : unsigned(69 downto 0) := to_unsigned(0,70);  -- correction factor 2 - dd; keeps its value between clock edges
+    variable leading_zeros : natural range 0 to 31;  -- left shift that moves the top set bit of b to bit 31
+    variable new_dd : unsigned( 35 downto 0);  -- normalised divisor: shifted b in bits 35..4, four zero bits below
+    variable new_nn : unsigned( 67 downto 0);  -- dividend shifted left by leading_zeros + 4
+    variable padded_d : unsigned(63 downto 0);  -- b with 32 zero bits appended so the normalising slice stays in range
+  begin
+    if rising_edge(clock) then
+      report "state is " & state_t'image(state);
+      -- only for vunit test
+      -- report "q$" & to_hstring(q) & " = n$" & to_hstring(n) & " / d$" & to_hstring(d);
+      if mult_signed = '0' then  -- single shared multiplier: operands assigned on one edge are multiplied on the next, product readable the edge after
+        mult_out <= mult_a * mult_b;
+      else
+        mult_out <= unsigned(signed(mult_a) * signed(mult_b));
+      end if;
+      if start_over = '0' then  -- the state machine only advances when no new operands are being loaded
+        case state is
+          when idle =>
+            null;
+            -- special startup case to allow for multiplier outputs to settle
+          when start_1 =>
+            -- f = 2 - dd
+            f := to_unsigned(0,70);
+            f(69) := '1';  -- bit 69 represents 2.0 (the top bit of the 68-bit dd is 0.5)
+            f := f - dd;
+            -- Now multiply both nn and dd by f
+            -- temp138 := nn * f;
+            report "mult_a=$" & to_hstring(mult_a) & ", mult_b=$" & to_hstring(mult_b) & ", mult_out=$" & to_hstring(mult_out);
+            mult_a <= nn;
+            mult_b <= f;
+            mult_signed <= '0';  -- the a * b product loaded at start_over is multiplied on this edge, still signed; later products are unsigned
+            state <= start_2;
+          when start_2 =>
+            report "mult_a=$" & to_hstring(mult_a) & ", mult_b=$" & to_hstring(mult_b) & ", mult_out=$" & to_hstring(mult_out);
+            mult_a <= dd;  -- queue dd * f
+            mult_b <= f;
+            -- multiplier gets set to a * b when start_over is asserted, so store the product.
+            p <= mult_out(137 downto 74);  -- a and b sat in the top 32 bits of mult_a / mult_b, so the product is in the top 64 bits
+            state <= start_3;
+          when start_3 =>
+            report "mult_a=$" & to_hstring(mult_a) & ", mult_b=$" & to_hstring(mult_b) & ", mult_out=$" & to_hstring(mult_out);
+            mult_a <= nn;  -- queue nn * f again so that product is in mult_out when step_1 runs
+            mult_b <= f;
+            state <= step_2;  -- mult_out will hold dd * f (queued in start_2) when step_2 runs
+          when step_1 =>
+            report "nn=$" & to_hstring(nn(67 downto 36)) & "." & to_hstring(nn(35 downto 4)) & "." & to_hstring(nn(3 downto 0))
+              & " / dd=$" & to_hstring(dd(67 downto 36)) & "." & to_hstring(dd(35 downto 4)) & "." & to_hstring(dd(3 downto 0));
+            report "mult_a=$" & to_hstring(mult_a) & ", mult_b=$" & to_hstring(mult_b) & ", mult_out=$" & to_hstring(mult_out);
+            -- f = 2 - dd
+            -- f := to_unsigned(0,70);
+            -- f(69) := '1';
+            -- f := f - dd;
+            report "f = $" & to_hstring(f);
+
+            -- Check whether to round up
+            if mult_out(67) = '1' then  -- bit 67 is the highest bit dropped from the nn * f product
+              nn <= mult_out(135 downto 68) + 1;
+              mult_a <= mult_out(135 downto 68) + 1;
+            else
+              nn <= mult_out(135 downto 68);
+              mult_a <= mult_out(135 downto 68);
+            end if;
+            -- Now multiply both nn and dd by f
+            -- temp138 := nn * f;
+            mult_b <= f;  -- f was already recomputed from the new dd in step_2
+            state <= step_2;
+            -- report "temp138=$" & to_hstring(temp138);
+          when step_2 =>
+            report "nn=$" & to_hstring(nn(67 downto 36)) & "." & to_hstring(nn(35 downto 4)) & "." & to_hstring(nn(3 downto 0))
+              & " / dd=$" & to_hstring(dd(67 downto 36)) & "." & to_hstring(dd(35 downto 4)) & "." & to_hstring(dd(3 downto 0));
+            report "mult_a=$" & to_hstring(mult_a) & ", mult_b=$" & to_hstring(mult_b) & ", mult_out=$" & to_hstring(mult_out);
+            -- temp138 := dd * f;
+            -- Check whether to round up, but avoid overflow
+            f := to_unsigned(0,70);
+            f(69) := '1';
+            -- f := f - dd;
+            if mult_out(67) = '1' and mult_out(135 downto 68) /= X"FFFFFFFFFFFFFFFFF" then  -- skip the round-up when it would wrap the 68-bit value to zero
+              dd <= mult_out(135 downto 68) + 1;
+              mult_a <= mult_out(135 downto 68) + 1;
+              f := f - (mult_out(135 downto 68) + 1);
+            else
+              dd <= mult_out(135 downto 68);
+              mult_a <= mult_out(135 downto 68);
+              f := f - mult_out(135 downto 68);
+            end if;
+            -- report "temp138=$" & to_hstring(temp138);
+            mult_b <= f;  -- queue (new dd) * (new f) for the next step_2
+            -- Perform number of required steps, or abort early if we can
+            if steps_remaining /= 0 and dd /= x"FFFFFFFFFFFFFFFFF" then  -- dd here is the value from before this edge; all ones means it has already converged
+              steps_remaining <= steps_remaining - 1;
+              state <= step_1;
+            else
+              state <= output;
+            end if;
+          when output =>
+            report "mult_a=$" & to_hstring(mult_a) & ", mult_b=$" & to_hstring(mult_b) & ", mult_out=$" & to_hstring(mult_out);
+            -- Round the final nn * f product to nearest instead of truncating; without
+            -- this, exact results such as 4/2 come out as 1.999999999
+            if mult_out(67) = '1' then
+              temp64(67 downto 0) := mult_out(135 downto 68) + 1;
+            else
+              temp64(67 downto 0) := mult_out(135 downto 68);
+            end if;
+            -- temp64(67 downto 0) := nn;
+            temp64(73 downto 68) := (others => '0');
+            temp64 := temp64 + 8;  -- add half of the 4-bit field dropped below, i.e. round at bit 3
+            report "temp64=$" & to_hstring(temp64);
+            busy <= '0';
+            q <= temp64(67 downto 4);  -- 32-bit integer part followed by 32-bit fraction
+            state <= idle;
+        end case;
+      end if;
+
+      if start_over='1' and b /= to_unsigned(0,32) then  -- new operands: normalise them and restart the iteration
+        report "Calculating $" & to_hstring(a) & " / $" & to_hstring(b);
+        leading_zeros := count_leading_zeros(b);
+        padded_d := b & X"00000000";
+        new_dd := (others => '0');
+        new_dd(35 downto 4) := padded_d(63-leading_zeros downto 32-leading_zeros);  -- b shifted left so its top set bit lands in bit 35
+        new_nn := (others => '0');
+        new_nn(35+leading_zeros downto 4+leading_zeros) := a;  -- a shifted left by the same amount, plus the 4 low bits
+        report "Normalised to $" & to_hstring(new_nn(67 downto 36)) & "." &
+          to_hstring(new_nn(35 downto 4)) & "." & to_hstring(new_nn(3 downto 0))
+          & " / $" & to_hstring(new_dd(35 downto 4)) & "." & to_hstring(new_dd(3 downto 0));
+        dd <= new_dd & X"00000000";  -- widen to 68 bits; the top set bit of b ends up in bit 67
+        nn <= new_nn;
+        state <= start_1;
+        steps_remaining <= 5;
+        busy <= '1';  -- read by the input process when deciding whether to raise start_over
+        -- load the shared multiplier with a and b (signed) so that start_2 can capture a * b into p
+        mult_a(35 downto 0) <= (others => '0');
+        mult_a(67 downto 36) <= a;
+        mult_b(37 downto 0) <= (others => '0');
+        mult_b(69 downto 38) <= b;
+        mult_signed <= '1';
+      elsif start_over='1' then
+        -- define divide by zero as zero
+        report "Ignoring divide by zero";
+        state <= idle;
+        busy <= '0';
+        q <= (others => '0');
+        -- zero product of a * b, since we know b = 0
+        p <= (others => '0');
+      end if;
+    end if;
+  end process;
+  
   process(clock) is
   begin
     if rising_edge(clock) then
@@ -68,42 +244,51 @@ begin
       if input_value_number = input_a then
 --        report "MATH: Unit #" & integer'image(unit)
 --          & ": Setting a=$" & to_hstring(input_value);
-        a <= signed(input_value);
+        a <= input_value;
+        if a /= input_value or busy = '0' then
+          start_over <= '1';
+        end if;
       end if;
       if input_value_number = input_b then
 --        report "MATH: Unit #" & integer'image(unit)
  --         & ": Setting b=$" & to_hstring(input_value);
-        b <= signed(input_value);
+        if invert_b = '1' then
+          b <= unsigned(-signed(input_value));
+          if b /= unsigned(-signed(input_value)) or busy = '0' then
+            start_over <= '1';
+          end if;
+        else
+          b <= input_value;
+          if b /= input_value or busy = '0' then
+            start_over <= '1';
+          end if;
+        end if;
       end if;
 
-      -- Calculate the result
-      p1 <= a*b;
-      p2 <= p1;
-      p3 <= p2;
-      p4 <= p3;
-      p <= p4;
-      -- Even units do addition, odd ones do subtraction
-      if (unit mod 2) = 0 then
-        s <= to_unsigned(to_integer(a)+to_integer(b),33);
-      else
-        s <= to_unsigned(to_integer(a)-to_integer(b),33);
+      if start_over = '1' then
+        start_over <= '0';
       end if;
 
-      -- Display output value when requested, and tri-state outputs otherwise
-      if output_select = unit then
-        if do_add='1' then
-          -- Output sign-extended 33 bit addition result
-          output_value(63 downto 33) <= (others => s(32));
-          output_value(32 downto 0) <= s;
-          report "MATH: Unit #" & integer'image(unit)
-            & " outputting addition sum $" & to_hstring(s);
-        else
-          output_value <= unsigned(p);
-          report "MATH: Unit #" & integer'image(unit)
-            & " outputting multiplication product $" & to_hstring(unsigned(p));
-        end if;
+      -- Compute sum of inputs
+      s <= unsigned((a(31) & a) + (b(31) & b));
+
+      -- Output result, stored in output register on the CPU side
+      if do_add='1' then
+        -- Output sign-extended 33 bit addition result
+        output_value(63 downto 33) <= (others => s(32));
+        output_value(32 downto 0) <= s;
+        report "MATH: Unit #" & integer'image(unit)
+          & " outputting addition sum $" & to_hstring(s);
+      elsif do_mult = '1' then
+        -- Output product shifted by multiplication shift
+        output_value <= shift_right(p, to_integer(mult_shift & "000"));
+        report "MATH: Unit #" & integer'image(unit)
+          & " outputting multiplication product $" & to_hstring(p);
       else
-        output_value <= (others => 'Z');
+        -- Output quotient and fractional part
+        output_value <= q;
+        report "MATH: Unit #" & integer'image(unit)
+          & " outputting division quotient $" & to_hstring(q);
       end if;
     end if;
   end process;
diff --git a/src/vhdl/gs4510.vhdl b/src/vhdl/gs4510.vhdl
index 77d89b3e2..c2adadcb6 100755
--- a/src/vhdl/gs4510.vhdl
+++ b/src/vhdl/gs4510.vhdl
@@ -37,7 +37,7 @@ use work.victypes.all;
 
 entity gs4510 is
   generic(
-    math_unit_enable : boolean := false;
+    math_unit_enable : boolean := true;
     chipram_1mb : std_logic := '0';
 
     cpufrequency : integer := 40;
@@ -1462,30 +1462,60 @@ architecture Behavioural of gs4510 is
   constant math_unit_count : integer := 16;
   type math_reg_array is array(0 to 15) of unsigned(31 downto 0);
   type math_config_array is array(0 to math_unit_count - 1) of math_unit_config;
+  type math_output_array is array(0 to math_unit_count - 1) of unsigned(63 downto 0);
+  type math_latch_array is array(0 to math_unit_count - 1) of integer range 0 to 15;
   signal reg_math_regs : math_reg_array := (others => to_unsigned(0,32));
   signal reg_math_config : math_config_array := (others => math_unit_config_v);
   signal reg_math_config_drive : math_config_array := (others => math_unit_config_v);
-  signal reg_math_latch_counter : unsigned(7 downto 0) := x"00";
-  signal reg_math_latch_interval : unsigned(7 downto 0) := x"00";
+  -- signal reg_math_latch_counter : unsigned(7 downto 0) := x"00";
+  -- signal reg_math_latch_interval : unsigned(7 downto 0) := x"00";
+  signal math_latch_value : integer range 0 to 15;  -- Latch value to write
+  signal math_latch_address : integer range 0 to 15;  -- Which unit to write latch value to
+  signal math_latch_write_toggle : std_logic := '0';
+  signal last_math_latch_write_toggle : std_logic := '0';
+  signal math_latch_reset_toggle : std_logic := '0';
+  signal last_math_latch_reset_toggle : std_logic := '0';
+  signal reg_math_latch_counters : math_latch_array := (others => 0);
+  signal reg_math_latch_fired : std_logic_vector(15 downto 0) := (others => '0');
+  signal reg_math_latch_intervals : math_latch_array := (others => 0);
+  -- Unit 15 needs to write to the last cycle status instead of current cycle, since
+  -- the register is copied over at the same time.
+  signal math_was_latched_current_cycle : std_logic_vector(14 downto 0) := (others => '0');
+  signal math_was_latched_last_cycle : std_logic_vector(15 downto 0) := (others => '0');
 
   -- We have the output counter out of phase with the input counter, so that we
   -- have time to catch an output, and store it, ready for presenting as an input
   -- very soon after.
-  signal math_input_counter : integer range 0 to 15 := 0;
-  signal math_output_counter : integer range 0 to 15 := 3;
-  signal prev_math_output_counter : integer range 0 to 15 := 2;
+  -- Note: with the original phasing (output counter starting at 3) the math cycle counter
+  -- advanced by 1 before math unit 1 had output anything, so both counters now start at 0.
+  constant math_input_counter_init : integer range 0 to 15 := 0;
+  signal math_input_counter : integer range 0 to 15 := math_input_counter_init;
+  constant math_output_counter_init : integer range 0 to 15 := 0;
+  signal math_output_counter : integer range 0 to 15 := math_output_counter_init;  -- originally 3
 
   signal math_input_number : integer range 0 to 15 := 0;
   signal math_input_value : unsigned(31 downto 0) := (others => '0');
-  signal math_output_value_low : unsigned(31 downto 0) := (others => '0');
-  signal math_output_value_high : unsigned(31 downto 0) := (others => '0');
+  signal math_output_values : math_output_array := (others => (others => '0'));
+  -- signal math_output_values_alt : math_alt_output_array := (others => (others => '0'));
+  -- signal math_output_value_low : unsigned(31 downto 0) := (others => 'Z');
+  -- signal math_output_value_high : unsigned(31 downto 0) := (others => 'Z');
 
   -- Start with input and outputting enabled
-  signal math_unit_flags : unsigned(7 downto 0) := x"03";
+  signal math_unit_flags : unsigned(7 downto 0) := x"01";
+  signal math_unit_mult_out_shift : unsigned(2 downto 0) := "000";
+  signal math_unit_less_than : std_logic := '0';
+  signal math_unit_greater_than : std_logic := '0';
+  signal math_unit_equal_to : std_logic := '0';
+  signal math_unit_invert_b : std_logic_vector(15 downto 0) := (others => '0');
+  -- halt math unit when math_unit_halted /= last_math_unit_halted
+  signal math_unit_halted : std_logic := '0';
+  signal last_math_unit_halted : std_logic := '0';
   -- Each write to the math registers is passed to the math unit to handle
   -- (this is to avoid ISE doing really weird things in synthesis, thinking
   -- that each bit of each register was a clock or something similarly odd.)
-  signal reg_math_write : std_logic := '0';
+  -- The reset and write systems need to directly read the toggle state, since
+  -- any intermediate adds enough latency to cause writes on every cycle to fail.
+  -- This is particularly bad with math register writing, since it breaks STQ.
   signal reg_math_write_toggle : std_logic := '0';
   signal last_reg_math_write_toggle : std_logic := '0';
   signal reg_math_regnum : integer range 0 to 15 := 0;
@@ -1494,6 +1524,9 @@ architecture Behavioural of gs4510 is
   -- Count # of math cycles since cycle latch last written to
   signal reg_math_cycle_counter : unsigned(31 downto 0) := to_unsigned(0,32);
   signal reg_math_cycle_counter_plus_one : unsigned(31 downto 0) := to_unsigned(0,32);
+  -- Reset math cycle counters
+  signal reg_math_cycle_counter_reset_toggle : std_logic := '0';
+  signal last_reg_math_cycle_counter_reset_toggle : std_logic := '0';
   -- # of math cycles to trigger end of job / math interrupt
   signal reg_math_cycle_compare : unsigned(31 downto 0) := to_unsigned(0,32);
 
@@ -1601,47 +1634,59 @@ begin
 
 
   multipliers: for unit in 0 to 7 generate
-    mult_unit : entity work.multiply32 port map (
+    mult_unit : entity work.multiply32 generic map (
+      unit => unit  -- the unit index is passed as a generic instead of a port
+      ) port map (
       clock => mathclock,
-      unit => unit,
       do_add => reg_math_config_drive(unit).do_add,
+      invert_b => math_unit_invert_b(unit),  -- this unit's bit of math_unit_invert_b
       input_a => reg_math_config_drive(unit).source_a,
       input_b => reg_math_config_drive(unit).source_b,
       input_value_number => math_input_number,
       input_value => math_input_value,
-      output_select => math_output_counter,
-      output_value(31 downto 0) => math_output_value_low,
-      output_value(63 downto 32) => math_output_value_high
+      output_shift => math_unit_mult_out_shift,  -- same signal the divider units receive as mult_shift
+      output_value => math_output_values(unit)  -- each unit drives its own 64-bit slot rather than a shared bus
+      -- output_select => math_output_counter,
+      -- output_value(31 downto 0) => math_output_value_low,
+      -- output_value(63 downto 32) => math_output_value_high
       );
   end generate;
 
   shifters: for unit in 8 to 11 generate
-    mult_unit : entity work.shifter32 port map (
+    shift_unit : entity work.shifter32 generic map (
+      unit => unit  -- the unit index is passed as a generic instead of a port
+      ) port map (
       clock => mathclock,
-      unit => unit,
       do_add => reg_math_config_drive(unit).do_add,
+      invert_b => math_unit_invert_b(unit),  -- this unit's bit of math_unit_invert_b
       input_a => reg_math_config_drive(unit).source_a,
       input_b => reg_math_config_drive(unit).source_b,
       input_value_number => math_input_number,
       input_value => math_input_value,
-      output_select => math_output_counter,
-      output_value(31 downto 0) => math_output_value_low,
-      output_value(63 downto 32) => math_output_value_high
+      output_value => math_output_values(unit)  -- each unit drives its own 64-bit slot rather than a shared bus
+      -- output_select => math_output_counter,
+      -- output_value(31 downto 0) => math_output_value_low,
+      -- output_value(63 downto 32) => math_output_value_high
       );
   end generate;
 
   dividerrs: for unit in 12 to 15 generate
-    mult_unit : entity work.divider32 port map (
+    div_unit : entity work.divider32 generic map (
+      unit => unit  -- the unit index is passed as a generic instead of a port
+      ) port map (
       clock => mathclock,
-      unit => unit,
       do_add => reg_math_config_drive(unit).do_add,
+      do_mult => math_unit_flags(2),  -- one flag bit shared by all four divider units
+      invert_b => math_unit_invert_b(unit),  -- this unit's bit of math_unit_invert_b
       input_a => reg_math_config_drive(unit).source_a,
       input_b => reg_math_config_drive(unit).source_b,
       input_value_number => math_input_number,
       input_value => math_input_value,
-      output_select => math_output_counter,
-      output_value(31 downto 0) => math_output_value_low,
-      output_value(63 downto 32) => math_output_value_high
+      mult_shift => math_unit_mult_out_shift,  -- same signal the multiplier units receive as output_shift
+      output_value => math_output_values(unit)  -- each unit drives its own 64-bit slot rather than a shared bus
+      -- output_select => math_output_counter,
+      -- output_value(31 downto 0) => math_output_value_low,
+      -- output_value(63 downto 32) => math_output_value_high
       );
   end generate;
 
@@ -1669,6 +1714,7 @@ begin
     );
 
   process (mathclock)
+    variable math_current_unit_has_latched : std_logic := '0';
   begin
     if rising_edge(mathclock) and math_unit_enable then
       -- For the plumbed math units, we want to avoid having two huge 16x32x32
@@ -1687,62 +1733,126 @@ begin
       -- counters at the CPU speed.
 
       -- Present input value to all math units
-      if math_input_counter /= 15 then
-        math_input_counter <= math_input_counter + 1;
+      -- Advance the input counter only while bit 0 is clear and the unit is not halted; otherwise hold it at its initial value (bit 0 set = write enabled)
+      if math_unit_flags(0) = '0' and math_unit_halted = last_math_unit_halted then
+        if math_input_counter /= 15 then
+          math_input_counter <= math_input_counter + 1;
+        else
+          math_input_counter <= 0;
+        end if;
+        -- only update the input value and reg when the counter is running
+        -- to prevent register updates while the inputs are offline from messing with
+        -- the math unit's internal registers
+        math_input_number <= math_input_counter;
+        math_input_value <= reg_math_regs(math_input_counter);
+        report "MATH: Presenting math reg #" & integer'image(math_input_counter)
+          &" = $" & to_hstring(reg_math_regs(math_input_counter));
       else
-        math_input_counter <= 0;
+        math_input_counter <= math_input_counter_init;
       end if;
-      math_input_number <= math_input_counter;
-      math_input_value <= reg_math_regs(math_input_counter);
-      report "MATH: Presenting math reg #" & integer'image(math_input_counter)
-        &" = $" & to_hstring(reg_math_regs(math_input_counter));
 
       -- Update output counter being shown to math units
-      if math_output_counter /= 15 then
-        math_output_counter <= math_output_counter + 1;
+      -- Advance the output counter only while bit 0 is clear and the unit is not halted; otherwise hold it at its initial value (bit 0 set = write enabled)
+      if math_unit_flags(0) = '0' and math_unit_halted = last_math_unit_halted then
+        if math_output_counter /= 15 then
+          math_output_counter <= math_output_counter + 1;
+        else
+          math_output_counter <= 0;
+        end if;
       else
-        math_output_counter <= 0;
+        math_output_counter <= math_output_counter_init;
       end if;
-      prev_math_output_counter <= math_output_counter;
+
       -- Based on the configuration for the previously selected unit,
       -- stash the results in the appropriate place
       if true then
-        report "MATH: output flags for unit #" & integer'image(prev_math_output_counter)
+        report "MATH: output flags for unit #" & integer'image(math_output_counter)
           & " = "
-          & std_logic'image(reg_math_config(prev_math_output_counter).output_low) & ", "
-          & std_logic'image(reg_math_config(prev_math_output_counter).output_high) & ", "
-          & integer'image(reg_math_config(prev_math_output_counter).output) & ", "
-          & std_logic'image(reg_math_config(prev_math_output_counter).latched) & ".";
+          & std_logic'image(reg_math_config(math_output_counter).output_low) & ", "
+          & std_logic'image(reg_math_config(math_output_counter).output_high) & ", "
+          & integer'image(reg_math_config(math_output_counter).output) & ", "
+          & std_logic'image(reg_math_config(math_output_counter).latched) & ".";
       end if;
 
-      if math_unit_flags(1) = '1' then
-        if (reg_math_config_drive(prev_math_output_counter).latched='0') or (reg_math_latch_counter = x"00") then
-          if reg_math_config_drive(prev_math_output_counter).output_high = '0' then
-            if reg_math_config_drive(prev_math_output_counter).output_low = '0' then
+      -- Make sure output counter is running before starting to stash outputs, to avoid constantly writing a register
+      -- Math config latch bit indicates whether to treat its latch interval as a counter (unset) or as a unit index (set).
+      if math_unit_flags(1) = '1' and math_unit_flags(0) = '0' and math_unit_halted = last_math_unit_halted then
+        math_current_unit_has_latched := '0';
+        if reg_math_config_drive(math_output_counter).latched = '0' then
+          -- Latched bit unset, use latch interval and counter to determine when to latch.
+          if math_latch_reset_toggle = last_math_latch_reset_toggle then
+            -- Math latches are not resetting, proceed with checks.
+            if reg_math_latch_counters(math_output_counter) = 0 then
+              math_current_unit_has_latched := '1';
+              reg_math_latch_counters(math_output_counter) <= reg_math_latch_intervals(math_output_counter);
+            elsif reg_math_latch_counters(math_output_counter) = 8 then
+              reg_math_latch_fired(math_output_counter) <= '1';
+              if reg_math_latch_fired(math_output_counter) = '0' then
+                math_current_unit_has_latched := '1';
+              end if;
+            else
+              reg_math_latch_counters(math_output_counter) <= reg_math_latch_counters(math_output_counter) - 1;
+            end if;
+          else
+            -- Math latches are resetting, so only latch if the interval to latch on is zero cycles.
+            if reg_math_latch_intervals(math_output_counter) = 0 then
+              math_current_unit_has_latched := '1';
+              reg_math_latch_fired(math_output_counter) <= '0';
+              reg_math_latch_counters(math_output_counter) <= reg_math_latch_intervals(math_output_counter);
+            elsif reg_math_latch_intervals(math_output_counter) = 8 then
+              math_current_unit_has_latched := '1';
+              reg_math_latch_fired(math_output_counter) <= '1';
+              reg_math_latch_counters(math_output_counter) <= reg_math_latch_intervals(math_output_counter);
+            else
+              reg_math_latch_fired(math_output_counter) <= '0';
+              reg_math_latch_counters(math_output_counter) <= reg_math_latch_intervals(math_output_counter) - 1;
+            end if;
+          end if;
+        else
+          -- Latched bit set, use a math unit's previous latch state to determine when to latch.
+          -- When resetting, assume no units were latched last cycle.
+          if math_latch_reset_toggle = last_math_latch_reset_toggle then
+            if math_was_latched_last_cycle(reg_math_latch_intervals(math_output_counter)) = '1' then
+              math_current_unit_has_latched := '1';
+            end if;
+          end if;
+        end if;
+
+        if math_output_counter = 15 then
+          -- Since this is the last unit, no intermediate is required.
+          math_was_latched_last_cycle(15) <= math_current_unit_has_latched;
+        else
+          math_was_latched_current_cycle(math_output_counter) <= math_current_unit_has_latched;
+        end if;
+
+        -- Process output if current unit has latched
+        if math_current_unit_has_latched = '1' then
+          if reg_math_config_drive(math_output_counter).output_high = '0' then
+            if reg_math_config_drive(math_output_counter).output_low = '0' then
               -- No output being kept, so nothing to do.
               null;
             else
               -- Only low output being kept
-              report "MATH: Setting reg_math_regs(" & integer'image(reg_math_config(prev_math_output_counter).output)
-                & ") from output of math unit #" & integer'image(prev_math_output_counter)
-                & " ( = $" & to_hstring(math_output_value_low) & ")";
-              reg_math_regs(reg_math_config(prev_math_output_counter).output) <= math_output_value_low;
+              report "MATH: Setting reg_math_regs(" & integer'image(reg_math_config(math_output_counter).output)
+                & ") from output of math unit #" & integer'image(math_output_counter)
+                & " ( = $" & to_hstring(math_output_values(math_output_counter)(31 downto 0)) & ")";
+              reg_math_regs(reg_math_config(math_output_counter).output) <= math_output_values(math_output_counter)(31 downto 0);
             end if;
           else
-            if reg_math_config_drive(prev_math_output_counter).output_low = '0' then
+            if reg_math_config_drive(math_output_counter).output_low = '0' then
               -- Only high half of output is being kept, so stash it
-              report "MATH: Setting reg_math_regs(" & integer'image(reg_math_config(prev_math_output_counter).output)
-                & ") from output of math unit #" & integer'image(prev_math_output_counter);
-              reg_math_regs(reg_math_config(prev_math_output_counter).output) <= math_output_value_high;
+              report "MATH: Setting reg_math_regs(" & integer'image(reg_math_config(math_output_counter).output)
+                & ") from output of math unit #" & integer'image(math_output_counter);
+              reg_math_regs(reg_math_config(math_output_counter).output) <= math_output_values(math_output_counter)(63 downto 32);
             else
               -- Both are being stashed, so store in consecutive slots
-              report "MATH: Setting reg_math_regs(" & integer'image(reg_math_config(prev_math_output_counter).output)
-                & ") (and next) from output of math unit #" & integer'image(prev_math_output_counter);
-              reg_math_regs(reg_math_config(prev_math_output_counter).output) <= math_output_value_low;
-              if reg_math_config_drive(prev_math_output_counter).output /= 15 then
-                reg_math_regs(reg_math_config_drive(prev_math_output_counter).output + 1) <= math_output_value_high;
+              report "MATH: Setting reg_math_regs(" & integer'image(reg_math_config(math_output_counter).output)
+                & ") (and next) from output of math unit #" & integer'image(math_output_counter);
+              reg_math_regs(reg_math_config(math_output_counter).output) <= math_output_values(math_output_counter)(31 downto 0);
+              if reg_math_config(math_output_counter).output /= 15 then  -- guard reads the same signal as the index below, so "+ 1" can never reach 16
+                reg_math_regs(reg_math_config(math_output_counter).output + 1) <= math_output_values(math_output_counter)(63 downto 32);  -- high word goes to the slot after the low word
               else
-                reg_math_regs(0) <= math_output_value_high;
+                reg_math_regs(0) <= math_output_values(math_output_counter)(63 downto 32);  -- low word went to register 15, so the high word wraps to register 0
               end if;
             end if;
           end if;
@@ -1754,11 +1864,19 @@ begin
       -- Implement writing to math registers
       if reg_math_write_toggle /= last_reg_math_write_toggle then
         last_reg_math_write_toggle <= reg_math_write_toggle;
-        reg_math_write <= '1';
       end if;
-      reg_math_write <= '0';
-      if math_unit_flags(0) = '1' then
-        if reg_math_write = '1' then
+
+      if math_latch_write_toggle /= last_math_latch_write_toggle then
+        last_math_latch_write_toggle <= math_latch_write_toggle;
+      end if;
+
+      if reg_math_cycle_counter_reset_toggle /= last_reg_math_cycle_counter_reset_toggle then
+        last_reg_math_cycle_counter_reset_toggle <= reg_math_cycle_counter_reset_toggle;
+      end if;
+
+      -- when math unit has been halted by the comparator, behave as if math_unit_flags(1 downto 0) = "01"
+      if math_unit_flags(0) = '1' or math_unit_halted /= last_math_unit_halted then
+        if reg_math_write_toggle /= last_reg_math_write_toggle then
           case reg_math_regbyte is
             when 0 => reg_math_regs(reg_math_regnum)(7 downto 0) <= reg_math_write_value;
             when 1 => reg_math_regs(reg_math_regnum)(15 downto 8) <= reg_math_write_value;
@@ -1767,25 +1885,66 @@ begin
             when others =>
           end case;
         end if;
+        if math_latch_write_toggle /= last_math_latch_write_toggle then
+          reg_math_latch_intervals(math_latch_address) <= math_latch_value;
+          reg_math_latch_counters(math_latch_address) <= math_latch_value;
+        end if;
       end if;
 
       -- Latch counter counts "math cycles", which is the time it takes for an
       -- output to appear on the inputs again, i.e., once per lap of the input
       -- and output propagation.
       -- TODO: implement reg_math_cycle_counter_reset signal, see D7E1
-      reg_math_cycle_counter_plus_one <= reg_math_cycle_counter + 1;
-      if math_output_counter = 1 then
-        -- Decrement latch counter
-        if reg_math_latch_counter = x"00" then
-          reg_math_latch_counter <= reg_math_latch_interval;
-          -- And update math cycle counter, if math unit is active
-          if math_unit_flags(1) = '1' then
-            reg_math_cycle_counter <= reg_math_cycle_counter_plus_one;
+      if math_output_counter = (15 + math_output_counter_init) mod 16 then
+        -- If a bit is set in math_was_latched_current_cycle, then that unit reset its latch counter and wrote an output.
+        -- For sequential latching, a unit needs to know the latch status from the previous cycle,
+        -- so the FPU needs to store which units latched on this cycle.
+        -- Unit 15 is an exception, since when this code runs, it still hasn't finished processing.
+        -- In order to avoid weird glitchiness with 15, it will write to the last cycle reg directly.
+        math_was_latched_last_cycle(14 downto 0) <= math_was_latched_current_cycle;
+        -- All units have been cycled through, so no more resetting to do.
+        if math_latch_reset_toggle /= last_math_latch_reset_toggle then
+          last_math_latch_reset_toggle <= math_latch_reset_toggle;
+        end if;
+        -- Update math cycle counter, if math unit is active
+        -- include a case for the reset, to avoid a possible edge case resulting in a double-drive
+        if math_unit_flags(1) = '1' and reg_math_cycle_counter_reset_toggle = last_reg_math_cycle_counter_reset_toggle and math_unit_halted = last_math_unit_halted then
+          if reg_math_cycle_counter_plus_one = reg_math_cycle_compare then
+            math_unit_halted <= not last_math_unit_halted;  -- disable calculation, enable writing to regs from CPU (disables counters)
           end if;
-        else
-          reg_math_latch_counter <= reg_math_latch_counter - 1;
+          reg_math_cycle_counter <= reg_math_cycle_counter_plus_one;
         end if;
       end if;
+
+      -- handle resetting the cycle counter, as well as updating reg_math_cycle_counter_plus_one, to avoid a multiple drive situation
+      if reg_math_cycle_counter_reset_toggle /= last_reg_math_cycle_counter_reset_toggle then
+        reg_math_cycle_counter <= (others => '0');
+        -- reg_math_cycle_counter_plus_one <= x"00000001";
+      end if;
+      reg_math_cycle_counter_plus_one <= reg_math_cycle_counter + 1;
+
+      -- We also provide some flags (which will later trigger interrupts) based
+      -- on the equality of math registers 14 and 15
+      math_unit_flags(6) <= math_unit_equal_to;
+      math_unit_flags(5) <= math_unit_less_than;
+      math_unit_flags(4) <= math_unit_greater_than;
+      if reg_math_regs(14) = reg_math_regs(15) then
+        math_unit_equal_to <= '1';
+      else
+        math_unit_equal_to <= '0';
+      end if;
+      if reg_math_regs(14) < reg_math_regs(15) then
+        math_unit_less_than <= '1';
+      else
+        math_unit_less_than <= '0';
+      end if;
+      if reg_math_regs(14) > reg_math_regs(15) then
+        math_unit_greater_than <= '1';
+      else
+        math_unit_greater_than <= '0';
+      end if;
+      -- Temporary: bit 7 is held at '0' for now; maybe use $D7E1.7 as an interrupt indicator later?
+      math_unit_flags(7) <= '0';
     end if;
   end process;
 
@@ -2988,10 +3147,21 @@ begin
                 &to_unsigned(reg_math_config(to_integer(the_read_address(3 downto 0))).output,4);
               -- @IO:GS $D7E0 MATH:LATCHINT Latch interval for latched outputs (in CPU cycles)
               -- $D7E1 is documented higher up
-            when x"E0" => return reg_math_latch_interval;
-            when x"E1" => return math_unit_flags;
+            when x"E0" => return to_unsigned(math_latch_address,4) & to_unsigned(reg_math_latch_intervals(math_latch_address),4);
+            when x"E1" =>
+              if math_unit_flags(3) = '0' then
+                if math_unit_halted = last_math_unit_halted then
+                  return math_unit_flags;
+                else
+                  return math_unit_flags(7 downto 2) & "01";
+                end if;
+              else
+                return math_unit_flags(7 downto 3) & math_unit_mult_out_shift(2 downto 0);
+              end if;
             -- @IO:GS $D7E2 MATH:RESERVED Reserved
             -- @IO:GS $D7E3 MATH:RESERVED Reserved
+            when x"E2" => return unsigned(math_unit_invert_b(7 downto 0));
+            when x"E3" => return unsigned(math_unit_invert_b(15 downto 8));
             --@IO:GS $D7E4 MATH:ITERCNT Iteration Counter (32 bit)
             --@IO:GS $D7E5 MATH:ITERCNT Iteration Counter (32 bit)
             --@IO:GS $D7E6 MATH:ITERCNT Iteration Counter (32 bit)
@@ -3575,13 +3745,32 @@ begin
           reg_math_config(to_integer(long_address(3 downto 0))).output <= to_integer(value(3 downto 0));
         elsif long_address(7 downto 0) = x"E0" then
           -- @IO:GS $D7E0 - Math unit latch interval (only update output of math function units every this many cycles, if they have the latch output flag set)
-          reg_math_latch_interval <= value;
+          math_latch_address <= to_integer(value(7 downto 4));
+          math_latch_value <= to_integer(value(3 downto 0));
+          math_latch_write_toggle <= not math_latch_write_toggle;
         elsif long_address(7 downto 0) = x"E1" then
           -- @IO:GS $D7E1 - Math unit general settings (writing also clears math cycle counter)
           -- @IO:GS $D7E1.0 MATH:WREN Enable setting of math registers (must normally be set)
           -- @IO:GS $D7E1.1 MATH:CALCEN Enable committing of output values from math units back to math registers (clearing effectively pauses iterative formulae)
-          math_unit_flags <= value;
+          math_unit_flags(3) <= value(3);
+          if value(3) = '1' then
+            if math_unit_flags(0) = '1' then
+              math_unit_mult_out_shift <= value(2 downto 0);
+            elsif math_unit_halted /= last_math_unit_halted then
+              math_unit_mult_out_shift <= value(2 downto 0);
+              math_unit_flags(1 downto 0) <= "01";  -- reset flags to halted state, since halted state is cleared.
+            end if;
+          else
+            math_unit_flags(2 downto 0) <= value(2 downto 0);
+          end if;
           -- reg_math_cycle_counter <= to_unsigned(0,32); -- TODO: Should generate a reg_math_cycle_counter_reset signal
+          reg_math_cycle_counter_reset_toggle <= not reg_math_cycle_counter_reset_toggle;
+          math_latch_reset_toggle <= not last_math_latch_reset_toggle;
+          last_math_unit_halted <= math_unit_halted;
+        elsif long_address(7 downto 0) = x"E2" then
+          math_unit_invert_b(7 downto 0) <= std_logic_vector(value);
+        elsif long_address(7 downto 0) = x"E3" then
+          math_unit_invert_b(15 downto 8) <= std_logic_vector(value);
         elsif long_address(7 downto 0) = x"E8" then
           reg_math_cycle_compare(7 downto 0) <= value;
         elsif long_address(7 downto 0) = x"E9" then
@@ -4243,39 +4432,6 @@ begin
       else
         chipselect_enables <= x"EF";
       end if;
-
-      if math_unit_enable then
-        -- We also provide some flags (which will later trigger interrupts) based
-        -- on the equality of math registers 14 and 15
-        if reg_math_regs(14) = reg_math_regs(15) then
-          math_unit_flags(6) <= '1';
-          if math_unit_flags(3 downto 2) = "00" then
-            math_unit_flags(7) <= '1' ;
-          end if;
-        else
-          math_unit_flags(6) <= '0';
-          if math_unit_flags(3 downto 2) = "11" then
-            math_unit_flags(7) <= '1' ;
-          end if;
-        end if;
-        if reg_math_regs(14) < reg_math_regs(15) then
-          math_unit_flags(5) <= '1';
-          if math_unit_flags(3 downto 2) = "10" then
-            math_unit_flags(7) <= '1' ;
-          end if;
-        else
-          math_unit_flags(5) <= '0';
-        end if;
-        if reg_math_regs(14) > reg_math_regs(15) then
-          math_unit_flags(4) <= '1';
-          if math_unit_flags(3 downto 2) = "01" then
-            math_unit_flags(7) <= '1' ;
-          end if;
-        else
-          math_unit_flags(4) <= '0';
-        end if;
-      end if;
-
     end if;
 
     -- BEGINNING OF MAIN PROCESS FOR CPU
diff --git a/src/vhdl/multiply32.vhdl b/src/vhdl/multiply32.vhdl
index 58a612941..5221dcebf 100644
--- a/src/vhdl/multiply32.vhdl
+++ b/src/vhdl/multiply32.vhdl
@@ -28,16 +28,20 @@ use Std.TextIO.all;
 use work.debugtools.all;
   
 entity multiply32 is
+  generic (
+    unit : integer range 0 to 15  -- this unit's number (0-15); now a generic instead of a port
+    );
   port (
     clock : in std_logic;
-    unit : in integer range 0 to 15;
     do_add : in std_logic;
+    invert_b : in std_logic;  -- when '1', operand b is loaded as -signed(input_value)
     input_a : in integer range 0 to 15;
     input_b : in integer range 0 to 15;
     input_value_number : in integer range 0 to 15;
-    input_value : unsigned(31 downto 0);
-    output_select : in integer range 0 to 15;
-    output_value : out unsigned(63 downto 0)
+    input_value : in unsigned(31 downto 0);
+    -- output_select : in integer range 0 to 15;
+    output_shift : in unsigned(2 downto 0);  -- product is shifted right by output_shift*8 bits before being output
+    output_value : out unsigned(63 downto 0) := (others => '0')
     );
 end entity;
 
@@ -73,7 +77,11 @@ begin
       if input_value_number = input_b then
 --        report "MATH: Unit #" & integer'image(unit)
 --          & ": Setting b=$" & to_hstring(input_value);
-        b <= signed(input_value);
+        if invert_b = '1' then
+          b <= -signed(input_value);
+        else
+          b <= signed(input_value);
+        end if;
       end if;
 
       -- Calculate the result
@@ -82,28 +90,22 @@ begin
       p3 <= p2;
       p4 <= p3;
       p <= p4;
-      -- Even units do addition, odd ones do subtraction
-      if (unit mod 2) = 0 then
-        s <= to_unsigned(to_integer(a)+to_integer(b),33);
-      else
-        s <= to_unsigned(to_integer(a)-to_integer(b),33);
-      end if;
 
-      -- Display output value when requested, and tri-state outputs otherwise
-      if output_select = unit then
-        if do_add='1' then
-          -- Output sign-extended 33 bit addition result
-          output_value(63 downto 33) <= (others => s(32));
-          output_value(32 downto 0) <= s;
-          report "MATH: Unit #" & integer'image(unit)
-            & " outputting addition sum $" & to_hstring(s);
-        else
-          output_value <= unsigned(p);
---          report "MATH: Unit #" & integer'image(unit)
---            & " outputting multiplication product $" & to_hstring(unsigned(p));
-        end if;
+      -- Calculate sum of inputs
+      s <= unsigned((a(31) & a)+(b(31) & b));
+
+      -- Output result, stored in output register on the CPU side
+      if do_add='1' then
+        -- Output sign-extended 33 bit addition result
+        output_value(63 downto 33) <= (others => s(32));
+        output_value(32 downto 0) <= s;
+        -- report "MATH: Unit #" & integer'image(unit)
+        --   & " outputting addition sum $" & to_hstring(s);
       else
-        output_value <= (others => 'Z');
+        -- Output product shifted by the output shift
+        output_value <= shift_right(unsigned(p), to_integer(output_shift & "000"));
+        -- report "MATH: Unit #" & integer'image(unit)
+        --   & " outputting multiplication product $" & to_hstring(unsigned(p));
       end if;
     end if;
   end process;
diff --git a/src/vhdl/shifter32.vhdl b/src/vhdl/shifter32.vhdl
index dace057d8..efc4ad0cc 100644
--- a/src/vhdl/shifter32.vhdl
+++ b/src/vhdl/shifter32.vhdl
@@ -28,16 +28,19 @@ use Std.TextIO.all;
 use work.debugtools.all;
   
 entity shifter32 is
+  generic (
+    unit : integer range 0 to 15  -- this unit's number (0-15); now a generic instead of a port
+    );
   port (
     clock : in std_logic;
-    unit : in integer range 0 to 15;
     do_add : in std_logic;
+    invert_b : in std_logic;  -- when '1', operand b is loaded as unsigned(-signed(input_value))
     input_a : in integer range 0 to 15;
     input_b : in integer range 0 to 15;
     input_value_number : in integer range 0 to 15;
     input_value : unsigned(31 downto 0);
-    output_select : in integer range 0 to 15;
-    output_value : out unsigned(63 downto 0)
+    -- output_select : in integer range 0 to 15;
+    output_value : out unsigned(63 downto 0) := (others => '0')
     );
 end entity;
 
@@ -59,16 +62,15 @@ begin
         a <= input_value;
       end if;
       if input_value_number = input_b then
-        b <= input_value;
+        if invert_b = '1' then
+          b <= unsigned(-signed(input_value));
+        else
+          b <= input_value;
+        end if;
       end if;
 
-      -- Calculate the result
-      -- Even units do addition, odd ones do subtraction
-      if (unit mod 2) = 0 then
-        s <= to_unsigned(to_integer(a)+to_integer(b),33);
-      else
-        s <= to_unsigned(to_integer(a)-to_integer(b),33);
-      end if;
+      -- Calculate sum of inputs
+      s <= unsigned((a(31) & a)+(b(31) & b));
 
       if b(7 downto 0) = x"00" then
         p(63 downto 32) <= (others => '0');
@@ -86,17 +88,14 @@ begin
         end if;
       end if;
 
-      -- Display output value when requested, and tri-state outputs otherwise
-      if output_select = unit then
-        if do_add='1' then
-          -- Output sign-extended 33 bit addition result
-          output_value(63 downto 33) <= (others => s(32));
-          output_value(32 downto 0) <= s;
-        else
-          output_value <= p;
-        end if;
+      -- Output result, stored in output register on the CPU side
+      if do_add='1' then
+        -- Output sign-extended 33 bit addition result
+        output_value(63 downto 33) <= (others => s(32));
+        output_value(32 downto 0) <= s;
       else
-        output_value <= (others => 'Z');
+        -- Output shifted result
+        output_value <= p;
       end if;
     end if;
   end process;