diff --git a/firmware/fpga/board.py b/firmware/fpga/board.py
index 9185f8725..d5c223615 100644
--- a/firmware/fpga/board.py
+++ b/firmware/fpga/board.py
@@ -37,14 +37,16 @@ class PralinePlatform(LatticeICE40Platform):
             Attrs(IO_STANDARD="SB_LVCMOS")),
         Resource("host_data", 0, Pins("21 19 6 13 10 3 4 18", dir="io"),
             Attrs(IO_STANDARD="SB_LVCMOS")),
-        Resource("q_invert", 0, Pins("9", dir="i"),
-            Attrs(IO_STANDARD="SB_LVCMOS")),
         Resource("direction", 0, Pins("12", dir="i"),
             Attrs(IO_STANDARD="SB_LVCMOS")),
         Resource("disable", 0, Pins("23", dir="i"),
             Attrs(IO_STANDARD="SB_LVCMOS")),
         Resource("capture_en", 0, Pins("11", dir="o"),
             Attrs(IO_STANDARD="SB_LVCMOS")),
+
+        # Other I/O.
+        Resource("q_invert", 0, Pins("9", dir="i"),
+            Attrs(IO_STANDARD="SB_LVCMOS")),
         Resource("trigger_in", 0, Pins("48", dir="i"),
             Attrs(IO_STANDARD="SB_LVCMOS")),
         Resource("trigger_out", 0, Pins("2", dir="o"),
diff --git a/firmware/fpga/build/praline_fpga.bin b/firmware/fpga/build/praline_fpga.bin
index 8279693b8..460d2552f 100644
Binary files a/firmware/fpga/build/praline_fpga.bin and b/firmware/fpga/build/praline_fpga.bin differ
diff --git a/firmware/fpga/dsp/fir.py b/firmware/fpga/dsp/fir.py
index 0faeda8b1..5b8a67e27 100644
--- a/firmware/fpga/dsp/fir.py
+++ b/firmware/fpga/dsp/fir.py
@@ -7,7 +7,7 @@
 from math                   import ceil, log2
 
 from amaranth               import Module, Signal, Mux, DomainRenamer
-from amaranth.lib           import wiring, stream, data, memory
+from amaranth.lib           import wiring, stream, data, memory, fifo
 from amaranth.lib.wiring    import In, Out
 from amaranth.utils         import bits_for
 
@@ -58,6 +58,17 @@ def elaborate(self, platform):
         # Arms
         m.submodules.fir = fir = FIRFilter(fir_taps, shape=self.data_shape, always_ready=always_ready, 
             num_channels=1, add_tap=len(fir_taps)//2+1)
+        fir_out_odd = Signal()
+        with m.If(fir.output.valid & fir.output.ready):
+            m.d.sync += fir_out_odd.eq(~fir_out_odd)
+
+        odd = Signal()
+        with m.If(self.input.valid & self.input.ready):
+            m.d.sync += odd.eq(~odd)
+
+        # Only switch modes on an even-sample boundary: switch_stb is high when `odd` will be 0 after this cycle.
+        switch_stb = Signal()
+        m.d.comb += switch_stb.eq((~odd) ^ (self.input.valid & self.input.ready))
 
         with m.FSM():
 
@@ -70,72 +81,54 @@ def elaborate(self, platform):
                     if not self.input.signature.always_ready:
                         m.d.comb += self.input.ready.eq(1)
 
-                with m.If(self.enable):
+                with m.If(self.enable & switch_stb):
                     m.next = "DECIMATE"
 
             with m.State("DECIMATE"):
 
-                # Input switching.
-                odd         = Signal()
-                input_idx   = Signal()
-                even_valid  = Signal()
+                # I and Q channels are muxed in time, demuxed later in the output stage.
                 even_buffer = Signal.like(self.input.p)
-                q_inputs    = Signal.like(self.input.p)
+                odd_buffer = Signal.like(self.input.p)
+                q_valid = Signal()
 
                 if not self.input.signature.always_ready:
-                    m.d.comb += self.input.ready.eq((~odd & ~even_valid) | fir.input.ready)
+                    m.d.comb += self.input.ready.eq(fir.input.ready)
 
-                # Even samples are buffered and used as a secondary 
-                # carry addition for the FIR filter.
-                # I and Q channels are muxed in time, demuxed later in the output stage.
-                with m.If(self.input.valid & self.input.ready):
-                    m.d.sync += odd.eq(~odd)
-                with m.If(~odd):
-                    with m.If(~even_valid | fir.input.ready):
-                        m.d.sync += even_valid.eq(self.input.valid)
-                        with m.If(self.input.valid):
-                            m.d.sync += even_buffer.eq(self.input.p)
-
-                # Process two I samples and two Q samples in sequence.
-                with m.If(fir.input.ready & fir.input.valid):
-                    m.d.sync += input_idx.eq(input_idx ^ 1)
-
-                with m.If(input_idx == 0):
+                with m.If(self.input.ready & self.input.valid):
+                    with m.If(~odd):
+                        m.d.sync += even_buffer.eq(self.input.p)
+                    with m.Else():
+                        m.d.sync += odd_buffer.eq(self.input.p)
+                        m.d.sync += q_valid.eq(1)
+
+                with m.If(odd):
                     m.d.comb += [
                         fir.add_input   .eq(even_buffer[0]),
                         fir.input.p     .eq(self.input.p[0]),
-                        fir.input.valid .eq(self.input.valid & even_valid),
+                        fir.input.valid .eq(self.input.valid),
                     ]
-                    with m.If(fir.input.ready & fir.input.valid):
-                        m.d.sync += [
-                            q_inputs[0].eq(even_buffer[1]),
-                            q_inputs[1].eq(self.input.p[1]),
-                        ]
                 with m.Else():
                     m.d.comb += [
-                        fir.add_input   .eq(q_inputs[0]),
-                        fir.input.p     .eq(q_inputs[1]),
-                        fir.input.valid .eq(1),
+                        fir.add_input   .eq(even_buffer[1]),
+                        fir.input.p     .eq(odd_buffer[1]),
+                        fir.input.valid .eq(q_valid),
                     ]
+                    with m.If(fir.input.ready):
+                        m.d.sync += q_valid.eq(0)
 
                 # Output sum and demux.
-                output_idx = Signal()
-
                 with m.If(~self.output.valid | self.output.ready):
                     if not fir.output.signature.always_ready:
                         m.d.comb += fir.output.ready.eq(1)
-                    m.d.sync += self.output.valid.eq(fir.output.valid & output_idx)
+                    m.d.sync += self.output.valid.eq(fir.output.valid & fir_out_odd)
                     with m.If(fir.output.valid):
                         m.d.sync += self.output.p[0].eq(self.output.p[1])
                         m.d.sync += self.output.p[1].eq(fir.output.p[0] * fixed.Const(0.5))
-                        m.d.sync += output_idx.eq(output_idx ^ 1)
 
-                # Mode switch logic.
-                with m.If(~self.enable):
-                    m.d.sync += input_idx.eq(0)
-                    m.d.sync += output_idx.eq(0)
-                    m.d.sync += odd.eq(0)
-                    m.d.sync += even_valid.eq(0)
+                # Mode switch logic: return to BYPASS only when switch_stb is high, clearing both sample buffers.
+                with m.If(~self.enable & switch_stb):
+                    m.d.sync += even_buffer.eq(0)
+                    m.d.sync += odd_buffer.eq(0)
                     m.next = "BYPASS"
 
         if self._domain != "sync":
@@ -180,9 +173,17 @@ def elaborate(self, platform):
         delay     = arm1_taps.index(1)
 
         # Arms
-        m.submodules.fir0 = fir0 = FIRFilter(arm0_taps, shape=self.data_shape, shape_out=self.shape_out, always_ready=always_ready, num_channels=self.num_channels)
-        m.submodules.fir1 = fir1 = Delay(delay, shape=self.data_shape, always_ready=always_ready, num_channels=self.num_channels)
-        arms = [fir0, fir1]
+        m.submodules.fir = fir = FIRFilter(arm0_taps, shape=self.data_shape, shape_out=self.shape_out, always_ready=always_ready, num_channels=self.num_channels)
+        m.submodules.dly = dly = Delay(delay, shape=self.data_shape, always_ready=always_ready, num_channels=self.num_channels)
+        m.submodules.dly_fifo = dly_fifo = fifo.SyncFIFOBuffered(width=self.num_channels*self.data_shape.as_shape().width, depth=1)
+        arms = [fir, dly]
+
+        m.d.comb += [
+            dly_fifo.w_data.eq(dly.output.p),
+            dly_fifo.w_en.eq(dly.output.valid),
+        ]
+        if not dly.output.signature.always_ready:
+            m.d.comb += dly.output.ready.eq(dly_fifo.w_rdy)
 
         with m.FSM():
 
@@ -205,7 +206,6 @@ def elaborate(self, platform):
                     m.next = "BYPASS"
 
                 # Input
-
                 for i, arm in enumerate(arms):
                     m.d.comb += arm.input.payload.eq(self.input.payload)
                     m.d.comb += arm.input.valid.eq(self.input.valid & arms[i^1].input.ready)
@@ -218,29 +218,25 @@ def elaborate(self, platform):
                 arm_index = Signal()
 
                 # Output buffers for each arm.
-                arm_outputs = [arm.output for arm in arms]
-                if self.output.signature.always_ready:
-                    buffers = [stream.Signature(arm.payload.shape()).create() for arm in arm_outputs]
-                    for arm, buf in zip(arm_outputs, buffers):
-                        with m.If(~buf.valid | buf.ready):
-                            if not arm.signature.always_ready:
-                                m.d.comb += arm.ready.eq(1)
-                            m.d.sync += buf.valid.eq(arm.valid)
-                            with m.If(arm.valid):
-                                m.d.sync += buf.payload.eq(arm.payload)
-                    arm_outputs = buffers
+                r_data_cast = data.ArrayLayout(self.data_shape, self.num_channels)(dly_fifo.r_data)
 
                 with m.If(~self.output.valid | self.output.ready):
                     with m.Switch(arm_index):
-                        for i, arm in enumerate(arm_outputs):
-                            with m.Case(i):
-                                for c in range(self.num_channels):
-                                    m.d.sync += self.output.payload[c].eq(arm.payload[c])
-                                m.d.sync += self.output.valid.eq(arm.valid)
-                                if not arm.signature.always_ready:
-                                    m.d.comb += arm.ready.eq(1)
-                                with m.If(arm.valid):
-                                    m.d.sync += arm_index.eq(arm_index ^ 1)
+                        with m.Case(0):
+                            for c in range(self.num_channels):
+                                m.d.sync += self.output.payload[c].eq(fir.output.payload[c])
+                            m.d.sync += self.output.valid.eq(fir.output.valid)
+                            if not fir.output.signature.always_ready:
+                                m.d.comb += fir.output.ready.eq(1)
+                            with m.If(fir.output.valid):
+                                m.d.sync += arm_index.eq(1)
+                        with m.Case(1):
+                            for c in range(self.num_channels):
+                                m.d.sync += self.output.payload[c].eq(r_data_cast[c])
+                            m.d.sync += self.output.valid.eq(dly_fifo.r_rdy)
+                            m.d.comb += dly_fifo.r_en.eq(1)
+                            with m.If(dly_fifo.r_rdy):
+                                m.d.sync += arm_index.eq(0)
 
         if self._domain != "sync":
             m = DomainRenamer(self._domain)(m)
@@ -446,24 +442,26 @@ def _generate_samples(self, count, width, f_width=0):
             return samples / (1 << f_width)
         return samples
 
-    def _filter(self, dut, samples, count, num_channels=1, outfile=None, empty_cycles=0):
+    def _filter(self, dut, samples, count, num_channels=1, outfile=None, empty_cycles=0, empty_ready_cycles=0):
 
         async def input_process(ctx):
             if hasattr(dut, "enable"):
                 ctx.set(dut.enable, 1)
-                await ctx.tick()
-            ctx.set(dut.input.valid, 1)
-            for sample in samples:
+            await ctx.tick()
+
+            for i, sample in enumerate(samples):
                 if num_channels > 1:
                     ctx.set(dut.input.payload, [s.item() for s in sample])
                 else:
-                    ctx.set(dut.input.payload, [sample.item()])
+                    if isinstance(dut.input.payload.shape(), data.ArrayLayout):
+                        ctx.set(dut.input.payload, [sample.item()])
+                    else:
+                        ctx.set(dut.input.payload, sample.item())
+                ctx.set(dut.input.valid, 1)
                 await ctx.tick().until(dut.input.ready)
+                ctx.set(dut.input.valid, 0)
                 if empty_cycles > 0:
-                    ctx.set(dut.input.valid, 0)
                     await ctx.tick().repeat(empty_cycles)
-                    ctx.set(dut.input.valid, 1)
-            ctx.set(dut.input.valid, 0)
 
         filtered = []
         async def output_process(ctx):
@@ -474,7 +472,14 @@ async def output_process(ctx):
                 if num_channels > 1:
                     filtered.append([v.as_float() for v in payload])
                 else:
-                    filtered.append(payload[0].as_float())
+                    if isinstance(payload.shape(), data.ArrayLayout):
+                        filtered.append(payload[0].as_float())
+                    else:
+                        filtered.append(payload.as_float())
+                if empty_ready_cycles > 0:
+                    ctx.set(dut.output.ready, 0)
+                    await ctx.tick().repeat(empty_ready_cycles)
+                    ctx.set(dut.output.ready, 1)
             if not dut.output.signature.always_ready:
                 ctx.set(dut.output.ready, 0)
 
@@ -505,100 +510,154 @@ def test_filter(self):
         filtered_np = np.convolve(input_samples, taps).tolist()
 
         # Simulate DUT
-        dut = FIRFilter(taps, fixed.SQ(15, 0), always_ready=True)
-        filtered = self._filter(dut, input_samples, len(input_samples))
+        dut = FIRFilter(taps, shape=fixed.SQ(8, 0), always_ready=False)
+        filtered = self._filter(dut, input_samples, len(input_samples), empty_ready_cycles=5)
 
         self.assertListEqual(filtered_np[:len(filtered)], filtered)
 
 
 class TestHalfBandDecimator(_TestFilter):
 
-    def test_filter_no_backpressure(self):
-        taps = [-1, 0, 9, 16, 9, 0, -1]
-        taps = [ tap / 32 for tap in taps ]
-
-        num_samples = 1024
-        input_width = 8
-        samples_i_in = self._generate_samples(num_samples, input_width, f_width=7)
-        samples_q_in = self._generate_samples(num_samples, input_width, f_width=7)
-
-        # Compute the expected result
-        filtered_i_np = np.convolve(samples_i_in, taps)[1::2].tolist()
-        filtered_q_np = np.convolve(samples_q_in, taps)[1::2].tolist()
-
-        # Simulate DUT
-        dut = HalfBandDecimator(taps, data_shape=fixed.SQ(7), shape_out=fixed.SQ(0,16), always_ready=True)
-        filtered = self._filter(dut, zip(samples_i_in, samples_q_in), len(samples_i_in) // 2, num_channels=2)
-        filtered_i = [ x[0] for x in filtered ]
-        filtered_q = [ x[1] for x in filtered ]
-
-        self.assertListEqual(filtered_i_np[:len(filtered_i)], filtered_i)
-        self.assertListEqual(filtered_q_np[:len(filtered_q)], filtered_q)
-
-    def test_filter_with_spare_cycles(self):
-        taps = [-1, 0, 9, 16, 9, 0, -1]
-        taps = [ tap / 32 for tap in taps ]
-
-        num_samples = 1024
-        input_width = 8
-        samples_i_in = self._generate_samples(num_samples, input_width, f_width=7)
-        samples_q_in = self._generate_samples(num_samples, input_width, f_width=7)
+    def test_filter(self):
 
-        # Compute the expected result
-        filtered_i_np = np.convolve(samples_i_in, taps)[1::2].tolist()
-        filtered_q_np = np.convolve(samples_q_in, taps)[1::2].tolist()
+        common_dut_options = dict(
+            data_shape=fixed.SQ(7),
+            shape_out=fixed.SQ(0,31),
+        )
 
-        # Simulate DUT
-        dut = HalfBandDecimator(taps, data_shape=fixed.SQ(7), shape_out=fixed.SQ(0,16), always_ready=True)
-        filtered = self._filter(dut, zip(samples_i_in, samples_q_in), len(samples_i_in) // 2, num_channels=2, empty_cycles=3)
-        filtered_i = [ x[0] for x in filtered ]
-        filtered_q = [ x[1] for x in filtered ]
+        taps0 = (np.array([-1, 0, 9, 16, 9, 0, -1]) / 32).tolist()
+        taps1 = (np.array([-2, 0, 7, 0, -18, 0, 41, 0, -92, 0, 320, 512, 320, 0, -92, 0, 41, 0, -18, 0, 7, 0, -2]) / 1024).tolist()
+
+
+        inputs = {
+
+            "test_filter_with_backpressure": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, always_ready=False, taps=taps0),
+                "sim_opts": dict(empty_cycles=0),
+            },
+
+            "test_filter_with_backpressure_and_empty_cycles": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, always_ready=False, taps=taps0),
+                "sim_opts": dict(empty_cycles=3),
+            },
+
+            "test_filter_with_backpressure_taps1": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, always_ready=False, taps=taps1),
+                "sim_opts": dict(empty_cycles=0),
+            },
+
+            "test_filter_no_backpressure_and_empty_cycles_taps1": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, always_ready=True, taps=taps0),
+                "sim_opts": dict(empty_cycles=6),
+            },
+
+            "test_filter_no_backpressure": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, always_ready=True, taps=taps1),
+                "sim_opts": dict(empty_cycles=3),
+            },
+        }
+        
+        for name, scenario in inputs.items():
 
-        self.assertListEqual(filtered_i_np[:len(filtered_i)], filtered_i)
-        self.assertListEqual(filtered_q_np[:len(filtered_q)], filtered_q)
+            with self.subTest(name):
+                taps        = scenario["dut_options"]["taps"]
+                num_samples = scenario["num_samples"]
 
-    def test_filter_with_backpressure(self):
-        taps = [-1, 0, 9, 16, 9, 0, -1]
-        taps = [ tap / 32 for tap in taps ]
+                input_width = 8
+                samples_i_in = self._generate_samples(num_samples, input_width, f_width=7)
+                samples_q_in = self._generate_samples(num_samples, input_width, f_width=7)
 
-        num_samples = 1024
-        input_width = 8
-        samples_i_in = self._generate_samples(num_samples, input_width, f_width=7)
-        samples_q_in = self._generate_samples(num_samples, input_width, f_width=7)
+                # Compute the expected result
+                filtered_i_np = np.convolve(samples_i_in, taps)[1::2].tolist()
+                filtered_q_np = np.convolve(samples_q_in, taps)[1::2].tolist()
 
-        # Compute the expected result
-        filtered_i_np = np.convolve(samples_i_in, taps)[1::2].tolist()
-        filtered_q_np = np.convolve(samples_q_in, taps)[1::2].tolist()
+                # Simulate DUT
+                dut = HalfBandDecimator(**scenario["dut_options"])
+                filtered = self._filter(dut, zip(samples_i_in, samples_q_in), len(samples_i_in) // 2, num_channels=2, **scenario["sim_opts"])
+                filtered_i = [ x[0] for x in filtered ]
+                filtered_q = [ x[1] for x in filtered ]
 
-        # Simulate DUT
-        dut = HalfBandDecimator(taps, data_shape=fixed.SQ(7), shape_out=fixed.SQ(0,16), always_ready=False)
-        filtered = self._filter(dut, zip(samples_i_in, samples_q_in), len(samples_i_in) // 2, num_channels=2)
-        filtered_i = [ x[0] for x in filtered ]
-        filtered_q = [ x[1] for x in filtered ]
+                self.assertListEqual(filtered_i_np[:len(filtered_i)], filtered_i)
+                self.assertListEqual(filtered_q_np[:len(filtered_q)], filtered_q)
 
-        self.assertListEqual(filtered_i_np[:len(filtered_i)], filtered_i)
-        self.assertListEqual(filtered_q_np[:len(filtered_q)], filtered_q)
 
 class TestHalfBandInterpolator(_TestFilter):
 
     def test_filter(self):
-        taps = [-1, 0, 9, 16, 9, 0, -1]
-        taps = [ tap / 32 for tap in taps ]
-        num_samples = 1024
-        input_width = 8
-        input_samples = self._generate_samples(num_samples, input_width, f_width=7)
 
-        # Compute the expected result
-        input_samples_pad = np.zeros(2*len(input_samples))
-        input_samples_pad[0::2] = 2*input_samples  # pad with zeros, adjust gain
-        filtered_np = np.convolve(input_samples_pad, taps).tolist()
+        common_dut_options = dict(
+            data_shape=fixed.SQ(7),
+            shape_out=fixed.SQ(1,16),
+        )
 
-        # Simulate DUT
-        dut = HalfBandInterpolator(taps, data_shape=fixed.SQ(0, 7), shape_out=fixed.SQ(0,16), always_ready=False)
-        filtered = self._filter(dut, input_samples, len(input_samples) * 2)
+        taps0 = (np.array([-1, 0, 9, 16, 9, 0, -1]) / 32).tolist()
+        taps1 = (np.array([-2, 0, 7, 0, -18, 0, 41, 0, -92, 0, 320, 512, 320, 0, -92, 0, 41, 0, -18, 0, 7, 0, -2]) / 1024).tolist()
+
+        inputs = {
+
+            "test_filter_with_backpressure": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, always_ready=False, num_channels=2, taps=taps1),
+                "sim_opts": dict(empty_cycles=0, empty_ready_cycles=0),
+            },
+
+            "test_filter_with_backpressure_and_empty_cycles": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, num_channels=2, always_ready=False, taps=taps0),
+                "sim_opts": dict(empty_ready_cycles=7, empty_cycles=3),
+            },
+
+            "test_filter_with_backpressure_taps1": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, num_channels=2, always_ready=False, taps=taps1),
+                "sim_opts": dict(empty_ready_cycles=7, empty_cycles=0),
+            },
+
+            "test_filter_no_backpressure_and_empty_cycles_taps1": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, num_channels=2, always_ready=True, taps=taps0),
+                "sim_opts": dict(empty_cycles=8),
+            },
+
+            "test_filter_no_backpressure": {
+                "num_samples": 1024,
+                "dut_options": dict(**common_dut_options, num_channels=2, always_ready=True, taps=taps1),
+                "sim_opts": dict(empty_cycles=16),
+            },
 
-        self.assertListEqual(filtered_np[:len(filtered)], filtered)
+        }
 
+    
+        for name, scenario in inputs.items():
+            with self.subTest(name):
+                taps        = scenario["dut_options"]["taps"]
+                num_samples = scenario["num_samples"]
+
+                input_width = 8
+                samples_i_in = self._generate_samples(num_samples, input_width, f_width=7)
+                samples_q_in = self._generate_samples(num_samples, input_width, f_width=7)
+
+                # Compute the expected result
+                input_samples_pad = np.zeros(2*len(samples_i_in))
+                input_samples_pad[0::2] = 2*samples_i_in  # pad with zeros, adjust gain
+                filtered_i_np = np.convolve(input_samples_pad, taps).tolist()
+                input_samples_pad = np.zeros(2*len(samples_q_in))
+                input_samples_pad[0::2] = 2*samples_q_in  # pad with zeros, adjust gain
+                filtered_q_np = np.convolve(input_samples_pad, taps).tolist()
+
+                # Simulate DUT
+                dut = HalfBandInterpolator(**scenario["dut_options"])
+                filtered = self._filter(dut, zip(samples_i_in, samples_q_in), len(samples_i_in) * 2, num_channels=2, **scenario["sim_opts"])
+                filtered_i = [ x[0] for x in filtered ]
+                filtered_q = [ x[1] for x in filtered ]
+
+                self.assertListEqual(filtered_i_np[:len(filtered_i)], filtered_i)
+                self.assertListEqual(filtered_q_np[:len(filtered_q)], filtered_q)
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/firmware/fpga/dsp/fir_mac16.py b/firmware/fpga/dsp/fir_mac16.py
index fea4824ad..84c59a899 100644
--- a/firmware/fpga/dsp/fir_mac16.py
+++ b/firmware/fpga/dsp/fir_mac16.py
@@ -7,7 +7,7 @@
 from math                   import ceil, log2
 
 from amaranth               import Module, Signal, Mux, DomainRenamer, ClockSignal, signed
-from amaranth.lib           import wiring, stream, data, memory
+from amaranth.lib           import wiring, stream, data, memory, fifo
 from amaranth.lib.wiring    import In, Out
 from amaranth.utils         import bits_for
 
@@ -58,7 +58,7 @@ def elaborate(self, platform):
 
         if not self.input.signature.always_ready:
             m.d.comb += self.input.ready.eq(~odd | fir.input.ready)
-            m.d.comb += dly.output.ready.eq(1)
+            m.d.comb += dly.output.ready.eq(fir.input.ready)
 
         m.d.comb += [
             dly.input.p.eq(self.input.p),
@@ -126,30 +126,43 @@ def elaborate(self, platform):
 
         taps      = [ 2 * tap for tap in self.taps ]
         arm0_taps = taps[0::2]
+        arm1_taps = taps[1::2]
+        delay     = arm1_taps.index(1)
 
         # Arms
-        m.submodules.fir = fir = FIRFilterMAC16(arm0_taps, shape=self.data_shape, shape_out=self.shape_out, overclock_rate=self.overclock_rate, always_ready=always_ready, num_channels=self.num_channels, delayed_port=True)
+        m.submodules.fir = fir = FIRFilterMAC16(arm0_taps, shape=self.data_shape, shape_out=self.shape_out, overclock_rate=self.overclock_rate, always_ready=always_ready, num_channels=self.num_channels)
+        m.submodules.dly = dly = Delay(delay, shape=self.data_shape, always_ready=always_ready, num_channels=self.num_channels)
+        m.submodules.dly_fifo = dly_fifo = fifo.SyncFIFOBuffered(width=self.num_channels*self.data_shape.as_shape().width, depth=self.overclock_rate+1)
+        
+        m.d.comb += [
+            dly_fifo.w_data.eq(dly.output.p),
+            dly_fifo.w_en.eq(dly.output.valid),
+        ]
+        if not dly.output.signature.always_ready:
+            m.d.comb += dly.output.ready.eq(dly_fifo.w_rdy)
 
-        busy = Signal()
-        with m.If(fir.input.valid & fir.input.ready):
-            m.d.sync += busy.eq(1)
+        #busy = Signal()
+        #with m.If(fir.input.valid & fir.input.ready):
+        #    m.d.sync += busy.eq(1)
 
         # Input
         m.d.comb += fir.input.payload.eq(self.input.payload)
-        m.d.comb += fir.input.valid.eq(self.input.valid & ~busy)
+        m.d.comb += fir.input.valid.eq(self.input.valid & dly.input.ready)
+        m.d.comb += dly.input.payload.eq(self.input.payload)
+        m.d.comb += dly.input.valid.eq(self.input.valid & fir.input.ready)
 
         if not self.input.signature.always_ready:
-            m.d.comb += self.input.ready.eq(fir.input.ready & ~busy)
+            m.d.comb += self.input.ready.eq(fir.input.ready & dly.input.ready)
 
         # Output
 
         # Arm index selection: switch after every delivered sample
         arm_index = Signal()
 
-        delayed = Signal.like(fir.input_delayed)
-        with m.If(fir.output.valid & fir.output.ready):
-            m.d.sync += delayed.eq(fir.input_delayed)
-
+        #delayed = Signal.like(fir.input_delayed)
+        #with m.If(fir.output.valid & fir.output.ready):
+        #    m.d.sync += delayed.eq(fir.input_delayed)
+        r_data_cast = data.ArrayLayout(self.data_shape, self.num_channels)(dly_fifo.r_data)
 
         with m.If(~self.output.valid | self.output.ready):
             with m.Switch(arm_index):
@@ -163,10 +176,11 @@ def elaborate(self, platform):
                         m.d.sync += arm_index.eq(1)
                 with m.Case(1):
                     for c in range(self.num_channels):
-                        m.d.sync += self.output.payload[c].eq(delayed[c])
-                    m.d.sync += self.output.valid.eq(1)
-                    m.d.sync += arm_index.eq(0)
-                    m.d.sync += busy.eq(0)
+                        m.d.sync += self.output.payload[c].eq(r_data_cast[c])
+                    m.d.sync += self.output.valid.eq(dly_fifo.r_rdy)
+                    m.d.comb += dly_fifo.r_en.eq(1)
+                    with m.If(dly_fifo.r_rdy):
+                        m.d.sync += arm_index.eq(0)
         
         if self._domain != "sync":
             m = DomainRenamer(self._domain)(m)
@@ -208,11 +222,12 @@ def __init__(self, taps, shape, shape_out=None, always_ready=False, overclock_ra
             })
         super().__init__(signature)
 
-    def taps_shape(self):
-        taps_as_ratios  = [tap.as_integer_ratio() for tap in self.taps]
+    def taps_shape(self, taps=None):
+        taps            = taps or self.taps
+        taps_as_ratios  = [tap.as_integer_ratio() for tap in taps]
         f_width         = bits_for(max(tap[1] for tap in taps_as_ratios)) - 1
         i_width         = max(0, bits_for(max(abs(tap[0]) for tap in taps_as_ratios)) - f_width)
-        return fixed.Shape(i_width, f_width, signed=any(tap < 0 for tap in self.taps))
+        return fixed.Shape(i_width, f_width, signed=any(tap < 0 for tap in taps))
 
     def compute_output_shape(self):
         taps_shape = self.taps_shape()
@@ -229,101 +244,105 @@ def compute_output_shape(self):
     def elaborate(self, platform):
         m = Module()
 
-        # Build filter out of FIRFilterSerialMAC16 blocks.
+        # Build filter out of SerialMAC16 blocks.
         overclock_factor = self.overclock_rate
 
-        # Symmetric coefficients special case.
-        symmetric = (self.taps == self.taps[::-1])
+        taps = self.taps
+
+        if self.carry is not None:
+            sum_carry_q = Signal.like(self.sum_carry)
 
-        # Even-symmetric case. (N=2*K)
-        # Odd-symmetric case. (N=2*K+1)
+        filters_ready = Signal()
+        window_valid = Signal()
+        input_ready = Signal()
+        m.d.comb += input_ready.eq(~window_valid | filters_ready)
+        if not self.input.signature.always_ready:
+            m.d.comb += self.input.ready.eq(input_ready)
+
+        # Samples window.
+        window = [ Signal.like(self.input.p, name=f"window_{i}") for i in range(len(self.taps)) ]
+
+        with m.If(input_ready):
+            m.d.sync += window_valid.eq(self.input.valid)
+            with m.If(self.input.valid):
+                m.d.sync += window[0].eq(self.input.p)
+                for i in range(1, len(window)):
+                    m.d.sync += window[i].eq(window[i-1])
+                if self.carry is not None:
+                    m.d.sync += sum_carry_q.eq(self.sum_carry)
+
+        # When filter is symmetric, presum samples to obtain a smaller window.
+        symmetric = (self.taps == self.taps[::-1])
         if symmetric:
-            taps = self.taps[:ceil(len(self.taps)/2)]
+            sum_shape = (self.input.p[0] + self.input.p[0]).shape()
             odd_symmetric = ((len(self.taps) % 2) == 1)
+            new_len = len(self.taps) // 2 + odd_symmetric
+            new_window = [ Signal(data.ArrayLayout(sum_shape, self.num_channels), name=f"window_sym_{i}") for i in range(new_len) ]
+            for i in range(len(new_window) - odd_symmetric):
+                for c in range(self.num_channels):
+                    m.d.comb += new_window[i][c].eq(window[i][c] + window[-i-1][c])
+            if odd_symmetric:
+                for c in range(self.num_channels):
+                    m.d.comb += new_window[-1][c].eq(window[len(self.taps)//2][c])
+            window = new_window
+            taps = self.taps[:ceil(len(self.taps)/2)]
+            samples_shape = sum_shape
         else:
-            taps = self.taps
+            samples_shape = self.shape
 
+        # Build filter out of SerialMAC16 blocks: each one multiplies and 
+        # accumulates `overclock_factor` taps serially.
         dsp_block_count = ceil(len(taps) / overclock_factor)
 
-
-        def pipe(signal, length):
-            name = signal.name if hasattr(signal, "name") else "signal"
-            pipe = [ signal ] + [ Signal.like(signal, name=f"{name}_q{i}") for i in range(length) ]
-            for i in range(length):
-                m.d.sync += pipe[i+1].eq(pipe[i])
-            return pipe
-
-
-        if self.carry is not None:
-            sum_carry_q = Signal.like(self.sum_carry)
-            with m.If(self.input.valid & self.input.ready):
-                m.d.sync += sum_carry_q.eq(self.sum_carry)
+        # If we have multiple subfilters, make them all the same size.
+        if dsp_block_count > 1 and len(taps) % overclock_factor != 0:
+            taps = taps + [0]*(overclock_factor - (len(taps)%overclock_factor))
 
         for c in range(self.num_channels):
 
-            last = self.input
             dsp_blocks = []
 
             for i in range(dsp_block_count):
                 taps_slice = taps[i*overclock_factor:(i+1)*overclock_factor]
-                input_delayed = len(taps_slice)
-                carry = last.output.p.shape() if i > 0 else self.carry
+                window_slice = window[i*overclock_factor:(i+1)*overclock_factor]
+                carry = None if i > 0 else self.carry
                 
-                if (i == dsp_block_count-1) and symmetric and odd_symmetric:
-                    taps_slice[-1] /= 2
-                    input_delayed -= 1
-
-                dsp = FIRFilterSerialMAC16(taps=taps_slice, shape=self.shape, taps_shape=self.taps_shape(), carry=carry, symmetry=symmetric,
-                    input_delayed_cycles=input_delayed, always_ready=self.always_ready)
+                dsp = SerialMAC16(taps=taps_slice, shape=samples_shape, taps_shape=self.taps_shape(taps), carry=carry, always_ready=self.always_ready)
                 dsp_blocks.append(dsp)
 
+                for j in range(len(window_slice)):
+                    m.d.comb += dsp.input.p[j].eq(window_slice[j][c])
+                    m.d.comb += dsp.input.valid.eq(window_valid)
+
                 if i == 0:
-                    m.d.comb += [
-                        dsp.input.p         .eq(self.input.p[c]),
-                        dsp.input.valid     .eq(self.input.valid & self.input.ready),
-                    ]
-                    if not self.input.signature.always_ready:
-                        m.d.comb += self.input.ready.eq(dsp.input.ready)
+                    m.d.comb += filters_ready.eq(dsp.input.ready)
                     if self.carry is not None:
                         m.d.comb += dsp.sum_carry.eq(sum_carry_q[c])
-                else:
-                    m.d.comb += [
-                        dsp.input.p         .eq(pipe(last.input_delayed, last.delay())[-1]),
-                        dsp.input.valid     .eq(last.output.valid),
-                        dsp.sum_carry       .eq(last.output.p),
-                    ]
-                    if not last.output.signature.always_ready:
-                        m.d.comb += last.output.ready.eq(dsp.input.ready)
-
-                last = dsp
-
-            if self.delayed_port:
-                m.d.comb += self.input_delayed[c].eq(last.input_delayed)
-
-            if symmetric:
-
-                for i in reversed(range(dsp_block_count)):
-                    end_block = (i == dsp_block_count-1)
-                    m.d.comb += [
-                        dsp_blocks[i].rev_input    .eq(dsp_blocks[i+1].rev_delayed if not end_block else dsp_blocks[i].input_delayed),
-                    ]
             
             m.submodules += dsp_blocks
 
-            m.d.comb += [
-                self.output.payload[c]  .eq(last.output.p),
-                self.output.valid       .eq(last.output.valid),
-            ]
-            if not last.output.signature.always_ready:
-                m.d.comb += last.output.ready.eq(self.output.ready)
+            # Adder tree for channel c
+            if dsp_block_count > 1:
+                with m.If(~self.output.valid | self.output.ready):
+                    for i in range(dsp_block_count):
+                        if not dsp_blocks[i].output.signature.always_ready:
+                            m.d.comb += dsp_blocks[i].output.ready.eq(1)
+                    m.d.sync += self.output.valid.eq(dsp_blocks[0].output.valid)
+                    with m.If(dsp_blocks[0].output.valid):
+                        m.d.sync += self.output.payload[c]  .eq(sum(dsp_blocks[i].output.p for i in range(dsp_block_count)))
+            else:
+                m.d.comb += self.output.payload[c].eq(dsp_blocks[0].output.p)
+                m.d.comb += self.output.valid.eq(dsp_blocks[0].output.valid)
+                if not dsp_blocks[0].output.signature.always_ready:
+                    m.d.comb += dsp_blocks[0].output.ready.eq(self.output.ready)
 
         return m
 
 
-class FIRFilterSerialMAC16(wiring.Component):
+class SerialMAC16(wiring.Component):
 
-    def __init__(self, taps, shape, shape_out=None, taps_shape=None, carry=None, symmetry=False, input_delayed_cycles=None, always_ready=False):
-        assert shape.as_shape().width <= 16, "DSP slice inputs have a maximum width of 16 bit."
+    def __init__(self, taps, shape, shape_out=None, taps_shape=None, carry=None, always_ready=False):
+        assert shape.as_shape().width <= 16, f"DSP slice inputs have a maximum width of 16 bit. {shape} {shape.as_shape().width}"
 
         self.carry = carry
         self.taps = list(taps)
@@ -333,15 +352,8 @@ def __init__(self, taps, shape, shape_out=None, taps_shape=None, carry=None, sym
             shape_out = self.compute_output_shape()
         self.shape_out = shape_out
         self.always_ready = always_ready
-        self.symmetry = symmetry
-        if input_delayed_cycles is None:
-            self.input_delayed_cycles = len(self.taps)
-        else:
-            self.input_delayed_cycles = input_delayed_cycles
-
         signature = {
-            "input":            In(stream.Signature(shape, always_ready=always_ready)),
-            "input_delayed":    Out(shape),
+            "input":            In(stream.Signature(data.ArrayLayout(shape, len(taps)), always_ready=always_ready)),
             "output":           Out(stream.Signature(shape_out, always_ready=always_ready)),
         }
         if carry is not None:
@@ -350,11 +362,6 @@ def __init__(self, taps, shape, shape_out=None, taps_shape=None, carry=None, sym
             })
         else:
             self.sum_carry = 0
-        if symmetry:
-            signature.update({
-                "rev_input": In(shape),
-                "rev_delayed": Out(shape),
-            })
         super().__init__(signature)
 
     def taps_shape(self):
@@ -375,72 +382,36 @@ def compute_output_shape(self):
         shape_out = fixed.Shape(i_width, f_width, signed=signed)
         return shape_out
 
-    def delay(self):
-        return 1 + 1 + 3 + len(self.taps) - 1
-
     def elaborate(self, platform):
         m = Module()
 
         depth = len(self.taps)
         counter_in   = Signal(range(depth))
-        counter_mult = Signal(range(depth))
-        counter_out  = Signal(range(depth))
-        dsp_ready = ~self.output.valid | self.output.ready
-
-        window_valid = Signal()
-        window_ready = dsp_ready
+        dsp_ready = Signal()
         multin_valid = Signal()
 
-
         input_ready = Signal()
         # Ready to process a sample either when the DSP slice is ready and the samples window is:
         # - Not valid yet.
         # - Only valid for 1 more cycle.
-        m.d.comb += input_ready.eq(~window_valid | ((counter_in == depth-1) & window_ready))
+        m.d.comb += input_ready.eq((counter_in == depth-1) & dsp_ready)
         if not self.input.signature.always_ready:
             m.d.comb += self.input.ready.eq(input_ready)
 
-        window = [ Signal.like(self.input.p, name=f"window_{i}") for i in range(max(depth, self.input_delayed_cycles)) ]
-
-        # Sample window.
-        with m.If(input_ready):
-            m.d.sync += window_valid.eq(self.input.valid)
-            with m.If(self.input.valid):
-                m.d.sync += window[0].eq(self.input.p)
-                for i in range(1, len(window)):
-                    m.d.sync += window[i].eq(window[i-1])
-
-        m.d.sync += multin_valid.eq(window_valid)
-
-        dsp_a = Signal.like(self.input.p)
-        with m.Switch(counter_in):
-            for i in range(depth):
-                with m.Case(i):
-                    m.d.sync += dsp_a.eq(window[i])
-
-        m.d.comb += self.input_delayed.eq(window[self.input_delayed_cycles-1])
-
         # Sample counter.
-        with m.If(window_ready & window_valid):
+        with m.If((self.input.valid | (counter_in != 0)) & dsp_ready):
             m.d.sync += counter_in.eq(_incr(counter_in, depth))
 
-        # Symmetry handling.
-        if self.symmetry:
-
-            window_rev = [ Signal.like(self.input.p, name=f"window_rev_{i}") for i in range(depth) ]
+        with m.If(dsp_ready):
+            m.d.sync += multin_valid.eq(self.input.valid | (counter_in != 0))
 
-            with m.If(input_ready & self.input.valid):
-                m.d.sync += window_rev[0].eq(self.rev_input)
-                m.d.sync += [ window_rev[i].eq(window_rev[i-1]) for i in range(1, len(window_rev)) ]
-            
-            m.d.comb += self.rev_delayed.eq(window_rev[-1])
-            
-            dsp_a_rev = Signal.like(self.input.p)
+        # Select sample from window.
+        dsp_a = Signal(self.shape)
+        with m.If(dsp_ready):
             with m.Switch(counter_in):
                 for i in range(depth):
                     with m.Case(i):
-                        m.d.sync += dsp_a_rev.eq(window_rev[depth-1-i])
-
+                        m.d.sync += dsp_a.eq(self.input.p[i])
 
         # Coefficient ROM.
         taps_shape = self.taps_shape
@@ -453,33 +424,38 @@ def elaborate(self, platform):
         m.submodules.coeff_rom = coeff_rom = memory.Memory(data=coeff_data)
         coeff_rd = coeff_rom.read_port(domain="sync")
         m.d.comb += coeff_rd.addr.eq(counter_in)
+        m.d.comb += coeff_rd.en.eq(dsp_ready)
 
         shape_out = self.compute_output_shape()
 
         if self.carry:
             sum_carry_q = Signal.like(self.sum_carry)
-            with m.If(self.input.ready & self.input.valid):
+            with m.If(input_ready):
                 m.d.sync += sum_carry_q.eq(self.sum_carry)
 
-        m.submodules.dsp = dsp = iCE40Multiplier()
-        if self.symmetry:
-            m.d.comb += dsp.a.eq(dsp_a + dsp_a_rev)
-        else:
-            m.d.comb += dsp.a.eq(dsp_a)
+        m.submodules.dsp = dsp = iCE40Multiplier(
+            o_width=shape_out.as_shape().width,
+            always_ready=self.always_ready)
+
+        valid_cnt = Signal(depth, init=1)
+        mult_cnt  = Signal(depth, init=1)
         m.d.comb += [
+            dsp.a               .eq(dsp_a),
             dsp.b               .eq(coeff_rd.data),
             shape_out(dsp.p)    .eq(sum_carry_q if self.carry is not None else 0),
-            dsp.valid_in        .eq(multin_valid & window_ready),
-            dsp.p_load          .eq(counter_mult == 0),
+            dsp.valid_in        .eq(multin_valid),
+            dsp_ready           .eq(dsp.ready_in),
+            dsp.p_load          .eq(mult_cnt[0]),
             self.output.p       .eq(shape_out(dsp.o)),
-            self.output.valid   .eq(dsp.valid_out & (counter_out == depth-1)),
+            self.output.valid   .eq(dsp.valid_out & valid_cnt[-1]),
+            dsp.ready_out       .eq(self.output.ready | ~valid_cnt[-1]),
         ]
         
         # Multiplier input and output counters.
-        with m.If(dsp.valid_in):
-            m.d.sync += counter_mult.eq(_incr(counter_mult, depth))
-        with m.If(dsp.valid_out):
-            m.d.sync += counter_out.eq(_incr(counter_out, depth))
+        with m.If(dsp.valid_in & dsp.ready_in):
+            m.d.sync += mult_cnt.eq(mult_cnt.rotate_left(1))
+        with m.If(dsp.valid_out & dsp.ready_out):
+            m.d.sync += valid_cnt.eq(valid_cnt.rotate_left(1))
 
         return m
 
@@ -487,15 +463,20 @@ def elaborate(self, platform):
 
 class iCE40Multiplier(wiring.Component):
 
-    a:          In(signed(16))
-    b:          In(signed(16))
-    valid_in:   In(1)
-
-    p:          In(signed(32))
-    p_load:     In(1)
-
-    o:          Out(signed(32))
-    valid_out:  Out(1)
+    def __init__(self, a_width=16, b_width=16, p_width=32, o_width=32, always_ready=False):
+        super().__init__({  # always_ready=False makes elaborate() insert an output FIFO in front of o/valid_out
+            "a": In(signed(a_width)),       # operand wired to the SB_MAC16 A input
+            "b": In(signed(b_width)),       # operand wired to the SB_MAC16 B input
+            "valid_in": In(1),              # strobe: a/b/p/p_load are accepted when valid_in & ready_in
+            "ready_in": Out(1),             # driven by this component in elaborate(), so it is an output, not an input
+            "p": In(signed(p_width)),       # value fed to the MAC C/D inputs instead of the running output when p_load is set
+            "p_load": In(1),                # selects p over the accumulated output for the accompanying operand pair
+            "o": Out(signed(o_width)),      # MAC output (read through the output FIFO when always_ready is False)
+            "valid_out": Out(1),            # marks o as valid
+            "ready_out": In(1),             # consumer-side ready; pops the output FIFO when always_ready is False
+        })
+        self.always_ready = always_ready
+        self.o_width = o_width              # also used as the output FIFO width
    
     def elaborate(self, platform):
         m = Module()
@@ -507,13 +488,20 @@ def pipe(signal, length):
             return pipe
 
         p_load_v    = Signal()
+        valid_v     = Signal()
+        m.d.comb += valid_v.eq(self.valid_in & self.ready_in)
 
         dsp_delay   = 3
-        valid_pipe  = pipe(self.valid_in, dsp_delay)
-        m.d.comb   += p_load_v.eq(self.p_load & self.valid_in)
+        valid_pipe  = pipe(valid_v, dsp_delay)
+        m.d.comb   += p_load_v.eq(self.p_load & valid_v)
         p_pipe      = pipe(self.p, dsp_delay-1)
         p_load_pipe = pipe(p_load_v, dsp_delay - 1)
-        m.d.comb   += self.valid_out.eq(valid_pipe[dsp_delay])
+
+        # skid buffer
+        if not self.always_ready:
+            m.submodules.out_fifo = out_fifo = fifo.SyncFIFOBuffered(width=self.o_width, depth=dsp_delay+2)
+        
+        m.d.comb += self.ready_in.eq(~self.valid_out | self.ready_out)
 
         m.submodules.sb_mac16 = mac = SB_MAC16(
             C_REG=0,
@@ -541,10 +529,10 @@ def pipe(signal, length):
             # Inputs.
             mac.CLK         .eq(ClockSignal("sync")),
             mac.CE          .eq(1),
-            mac.C           .eq(Mux(p_load_pipe[2], p_pipe[2][16:], self.o[16:])),
-            mac.A           .eq(self.a),
-            mac.B           .eq(self.b),
-            mac.D           .eq(Mux(p_load_pipe[2], p_pipe[2][:16], self.o[:16])),
+            mac.C.as_signed().eq(Mux(p_load_pipe[2], p_pipe[2][16:], mac.O[16:])),
+            mac.A.as_signed().eq(self.a),
+            mac.B.as_signed().eq(self.b),
+            mac.D.as_signed().eq(Mux(p_load_pipe[2], p_pipe[2][:16], mac.O[:16])),
             mac.AHOLD       .eq(~valid_pipe[0]),  # 0: load
             mac.BHOLD       .eq(~valid_pipe[0]),
             mac.CHOLD       .eq(0),
@@ -555,11 +543,23 @@ def pipe(signal, length):
             mac.ADDSUBBOT   .eq(0),
             mac.OLOADTOP    .eq(0),
             mac.OLOADBOT    .eq(0),
-            
-            # Outputs.
-            self.o          .eq(mac.O),
         ]
 
+        if not self.always_ready:
+            m.d.comb += [
+                out_fifo.w_data.eq(mac.O),
+                out_fifo.w_en.eq(valid_pipe[dsp_delay]),
+                
+                self.o.eq(out_fifo.r_data),
+                self.valid_out.eq(out_fifo.r_rdy),
+                out_fifo.r_en.eq(self.ready_out),
+            ]
+        else:
+            m.d.comb += [                
+                self.o.eq(mac.O),
+                self.valid_out.eq(valid_pipe[dsp_delay]),
+            ]
+
         return m
 
 
@@ -593,7 +593,7 @@ def _generate_samples(self, count, width, f_width=0):
             return samples / (1 << f_width)
         return samples
 
-    def _filter(self, dut, samples, count, num_channels=1, outfile=None, empty_cycles=0):
+    def _filter(self, dut, samples, count, num_channels=1, outfile=None, empty_cycles=0, empty_ready_cycles=0):
 
         async def input_process(ctx):
             if hasattr(dut, "enable"):
@@ -627,6 +627,10 @@ async def output_process(ctx):
                         filtered.append(payload[0].as_float())
                     else:
                         filtered.append(payload.as_float())
+                if empty_ready_cycles > 0:
+                    ctx.set(dut.output.ready, 0)
+                    await ctx.tick().repeat(empty_ready_cycles)
+                    ctx.set(dut.output.ready, 1)
             if not dut.output.signature.always_ready:
                 ctx.set(dut.output.ready, 0)
 
@@ -645,23 +649,6 @@ async def output_process(ctx):
 
 class TestFIRFilterMAC16(_TestFilter):
 
-    def test_filter_serial(self):
-        taps = [-1, 0, 9, 16, 9, 0, -1]
-        taps = [ tap / 32 for tap in taps ]
-
-        num_samples = 1024
-        input_width = 8
-        input_samples = self._generate_samples(num_samples, input_width)
-
-        # Compute the expected result
-        filtered_np = np.convolve(input_samples, taps).tolist()
-
-        # Simulate DUT
-        dut = FIRFilterSerialMAC16(taps, fixed.SQ(15, 0), always_ready=False)
-        filtered = self._filter(dut, input_samples, len(input_samples))
-
-        self.assertListEqual(filtered_np[:len(filtered)], filtered)
-
     def test_filter(self):
         taps = [-1, 0, 9, 16, 9, 0, -1]
         taps = [ tap / 32 for tap in taps ]
@@ -674,8 +661,8 @@ def test_filter(self):
         filtered_np = np.convolve(input_samples, taps).tolist()
 
         # Simulate DUT
-        dut = FIRFilterMAC16(taps, fixed.SQ(15, 0), always_ready=False)
-        filtered = self._filter(dut, input_samples, len(input_samples))
+        dut = FIRFilterMAC16(taps, shape=fixed.SQ(8, 0), always_ready=False)
+        filtered = self._filter(dut, input_samples, len(input_samples), empty_ready_cycles=5)
 
         self.assertListEqual(filtered_np[:len(filtered)], filtered)
 
@@ -717,7 +704,7 @@ def test_filter(self):
             "test_filter_no_backpressure_and_empty_cycles_taps1": {
                 "num_samples": 1024,
                 "dut_options": dict(**common_dut_options, always_ready=True, taps=taps0),
-                "sim_opts": dict(empty_cycles=3),
+                "sim_opts": dict(empty_cycles=6),
             },
 
             "test_filter_no_backpressure": {
@@ -768,20 +755,20 @@ def test_filter(self):
 
             "test_filter_with_backpressure": {
                 "num_samples": 1024,
-                "dut_options": dict(**common_dut_options, always_ready=False, num_channels=2, taps=taps0),
-                "sim_opts": dict(empty_cycles=0),
+                "dut_options": dict(**common_dut_options, always_ready=False, num_channels=2, taps=taps1),
+                "sim_opts": dict(empty_cycles=0, empty_ready_cycles=0),
             },
 
             "test_filter_with_backpressure_and_empty_cycles": {
                 "num_samples": 1024,
                 "dut_options": dict(**common_dut_options, num_channels=2, always_ready=False, taps=taps0),
-                "sim_opts": dict(empty_cycles=3),
+                "sim_opts": dict(empty_ready_cycles=7, empty_cycles=3),
             },
 
             "test_filter_with_backpressure_taps1": {
                 "num_samples": 1024,
                 "dut_options": dict(**common_dut_options, num_channels=2, always_ready=False, taps=taps1),
-                "sim_opts": dict(empty_cycles=0),
+                "sim_opts": dict(empty_ready_cycles=7, empty_cycles=0),
             },
 
             "test_filter_no_backpressure_and_empty_cycles_taps1": {
diff --git a/firmware/fpga/interface/__init__.py b/firmware/fpga/interface/__init__.py
index a19e3fc2e..530d7af89 100644
--- a/firmware/fpga/interface/__init__.py
+++ b/firmware/fpga/interface/__init__.py
@@ -1 +1,3 @@
-from .max586x import MAX586xInterface
\ No newline at end of file
+from .max586x import MAX586xInterface
+from .spi import SPIRegisterInterface
+from .sgpio import SGPIOInterface
diff --git a/firmware/fpga/interface/max586x.py b/firmware/fpga/interface/max586x.py
index b94d21527..60ffade97 100644
--- a/firmware/fpga/interface/max586x.py
+++ b/firmware/fpga/interface/max586x.py
@@ -9,13 +9,11 @@
 
 from util                   import IQSample
 
+
 class MAX586xInterface(wiring.Component):
-    adc_stream: Out(stream.Signature(IQSample(8), always_ready=True))
+    adc_stream: Out(stream.Signature(IQSample(8), always_ready=True, always_valid=True))
     dac_stream: In(stream.Signature(IQSample(8), always_ready=True))
-
-    adc_capture: In(1)
-    dac_capture: In(1)
-    q_invert:    In(1)
+    q_invert:   In(1)
 
     def __init__(self, bb_domain):
         super().__init__()
@@ -47,10 +45,9 @@ def elaborate(self, platform):
         m.d.comb += [
             adc_stream.p.i      .eq(adc_in.i[0] ^ 0x80),       # I: non-inverted between MAX2837 and MAX5864.
             adc_stream.p.q      .eq(adc_in.i[1] ^ rx_q_mask),  # Q: inverted between MAX2837 and MAX5864.
-            adc_stream.valid    .eq(self.adc_capture),
         ]
 
-        # Output the transformed data to the DAC using a DDR output buffer.
+        # Output to the DAC using a DDR output buffer.
         m.submodules.dac_out = dac_out = io.DDRBuffer("o", platform.request("dd", dir="-"), o_domain=self._bb_domain)
         with m.If(dac_stream.valid):
             m.d.comb += [
diff --git a/firmware/fpga/interface/sgpio.py b/firmware/fpga/interface/sgpio.py
new file mode 100644
index 000000000..52896ac24
--- /dev/null
+++ b/firmware/fpga/interface/sgpio.py
@@ -0,0 +1,202 @@
+#
+# This file is part of HackRF.
+#
+# Copyright (c) 2025 Great Scott Gadgets <info@greatscottgadgets.com>
+# SPDX-License-Identifier: BSD-3-Clause
+
+from amaranth               import Module, Signal, DomainRenamer, EnableInserter, ClockSignal, Instance
+from amaranth.lib           import io, fifo, stream, wiring, cdc
+from amaranth.lib.wiring    import Out, In
+
+from util                   import LinearFeedbackShiftRegister
+
+
+class SGPIOInterface(wiring.Component):
+    # Moves samples between the ADC/DAC sample streams and the byte-wide `host_data` bus clocked via `host_clk`.
+    def __init__(self, sample_width=8, rx_assignments=None, tx_assignments=None, domain="sync"):
+        self.sample_width = sample_width  # payload width of both streams, in bits
+        if rx_assignments is None:
+            rx_assignments = _default_rx_assignments(sample_width // 8)  # one entry per payload byte
+        if tx_assignments is None:
+            tx_assignments = _default_tx_assignments(sample_width // 8)  # one entry per payload byte
+        self.rx_assignments = rx_assignments  # callables: sample word -> byte sent to the host on that bus cycle
+        self.tx_assignments = tx_assignments  # callables: (sample word, received byte) -> assignment statement
+        self._domain = domain                 # clock domain that "sync" is renamed to at the end of elaborate()
+        super().__init__({
+            "adc_stream": In(stream.Signature(sample_width, always_ready=True)),  # samples to send to the host
+            "dac_stream": Out(stream.Signature(sample_width)),                    # samples received from the host
+            "trigger_en": In(1),  # forwarded to FlowAndTriggerControl.trigger_en
+            "prbs":       In(1),  # while set (and sending to the host), emit the PRBS pattern instead of ADC data
+        })
+
+    def elaborate(self, platform):
+        m = Module()
+
+        adc_stream = self.adc_stream
+        dac_stream = self.dac_stream
+        rx_cycles = len(self.rx_assignments)  # bus cycles per sample, FPGA -> host
+        tx_cycles = len(self.tx_assignments)  # bus cycles per sample, host -> FPGA
+
+        direction_i = platform.request("direction").i
+        enable_i    = ~platform.request("disable").i  # inverted `disable` pin
+        capture_en  = platform.request("capture_en").o
+        m.d.comb += capture_en.eq(1)  # `capture_en` pin is held high unconditionally
+
+        # Determine data transfer direction.
+        direction  = Signal()
+        m.submodules.direction_cdc = cdc.FFSynchronizer(direction_i, direction, o_domain=self._domain)
+        transfer_from_adc = (direction == 0)  # direction low: ADC -> host; otherwise host -> DAC
+
+        # SGPIO clock and data lines.
+        tx_clk_en      = Signal()
+        rx_clk_en      = Signal()
+        data_to_host   = Signal(self.sample_width)  # sample latched at the start of an RX transfer
+        byte_to_host   = Signal(8)
+        data_from_host = Signal(self.sample_width)  # sample being reassembled from received bytes
+        byte_from_host = Signal(8)
+
+        m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain)
+        m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain)
+
+        m.d.sync += clk_out.o[0].eq(tx_clk_en)  # first DDR slot of the clock pin carries the TX enable
+        m.d.sync += clk_out.o[1].eq(rx_clk_en)  # second DDR slot carries the RX enable
+        m.d.sync += host_io.oe.eq(transfer_from_adc)  # drive the bus only in the ADC -> host direction
+        m.d.comb += host_io.o[0].eq(byte_to_host)  # same byte on both DDR slots
+        m.d.comb += host_io.o[1].eq(byte_to_host)
+        m.d.comb += byte_from_host.eq(host_io.i[1])  # only the second DDR input slot is used
+
+        # Transmission is handled differently to account for the latency before the data
+        # becomes available in the FPGA fabric.
+        ddr_in_latency = 2  # for iCE40 DDR inputs in Amaranth.
+        tx_write_latency = tx_cycles + ddr_in_latency
+        tx_write_pipe = Signal(tx_write_latency)  # one-hot marker of an in-flight TX word; bit 0 is set in IDLE below
+        m.d.sync += tx_write_pipe.eq(tx_write_pipe << 1)
+        for i in range(tx_cycles-1):  # don't store last byte
+            with m.If(tx_write_pipe[ddr_in_latency + i]):
+                m.d.sync += self.tx_assignments[i](data_from_host, byte_from_host)
+
+        # Small TX FIFO to avoid missing samples when the consumer deasserts its ready
+        # signal and transfers are in progress.
+        m.submodules.tx_fifo = tx_fifo = fifo.SyncFIFOBuffered(width=self.sample_width, depth=16)
+        m.d.comb += [
+            tx_fifo.w_data      .eq(data_from_host),
+            self.tx_assignments[-1](tx_fifo.w_data, byte_from_host),  # last byte comes straight from the bus; this later assignment replaces its part of w_data
+            tx_fifo.w_en        .eq(tx_write_pipe[-1]),  # write once the final byte of the word is present
+            dac_stream.p        .eq(tx_fifo.r_data),
+            dac_stream.valid    .eq(tx_fifo.r_rdy),
+            tx_fifo.r_en        .eq(dac_stream.ready),
+        ]
+
+        # Pseudo-random binary sequence generator.
+        prbs_advance = Signal()
+        prbs_count = Signal(2)  # free-running in the PRBS state; the LFSR advances when it reads 0 (every 4th cycle)
+        m.submodules.prbs = prbs = EnableInserter(prbs_advance)(
+            LinearFeedbackShiftRegister(degree=8, taps=[8,6,5,4], init=0b10110001))
+
+
+        # Capture signal generation.
+        capture = Signal()
+        m.submodules.trigger_gen = trigger_gen = FlowAndTriggerControl(domain=self._domain)
+        m.d.comb += [
+            trigger_gen.enable.eq(enable_i),
+            trigger_gen.trigger_en.eq(self.trigger_en),
+            capture.eq(trigger_gen.capture),
+        ]
+
+
+        # Main state machine.
+        with m.FSM():
+            with m.State("IDLE"):  # first cycle of every transfer is issued from here
+
+                with m.If(transfer_from_adc):
+                    with m.If(self.prbs):  # PRBS mode takes priority over ADC data
+                        m.next = "PRBS"
+                    with m.Elif(adc_stream.valid & capture):
+                        m.d.comb += rx_clk_en.eq(1)
+                        m.d.sync += data_to_host.eq(adc_stream.p)  # keep the sample for the remaining bytes
+                        m.d.sync += byte_to_host.eq(self.rx_assignments[0](adc_stream.p))
+                        if rx_cycles > 1:  # single-cycle transfers never leave IDLE
+                            m.next = "RX0"
+                with m.Else():
+                    with m.If(dac_stream.ready & capture):
+                        m.d.comb += tx_clk_en.eq(1)
+                        m.d.sync += tx_write_pipe[0].eq(capture)  # start tracking this word in the write pipe
+                        if tx_cycles > 1:  # single-cycle transfers never leave IDLE
+                            m.next = "TX0"
+
+            for i in range(rx_cycles-1):  # remaining bytes of a multi-byte sample, taken from data_to_host
+                with m.State(f"RX{i}"):
+                    m.d.comb += rx_clk_en.eq(1)
+                    m.d.sync += byte_to_host.eq(self.rx_assignments[i+1](data_to_host))
+                    m.next = "IDLE" if i == rx_cycles-2 else f"RX{i+1}"
+
+            for i in range(tx_cycles-1):  # keep the TX clock enabled for the remaining bytes of the word
+                with m.State(f"TX{i}"):
+                    m.d.comb += tx_clk_en.eq(1)
+                    m.next = "IDLE" if i == tx_cycles-2 else f"TX{i+1}"
+
+            with m.State("PRBS"):  # stream the LFSR value to the host until `prbs` is deasserted
+                m.d.comb += rx_clk_en.eq(prbs_count == 0)
+                m.d.comb += prbs_advance.eq(prbs_count == 0)
+                m.d.sync += byte_to_host.eq(prbs.value)
+                m.d.sync += prbs_count.eq(prbs_count + 1)
+                with m.If(~self.prbs):
+                    m.next = "IDLE"
+
+        # Convert to other clock domain if necessary.
+        if self._domain != "sync":
+            m = DomainRenamer(self._domain)(m)
+
+        return m
+
+
+def _default_rx_assignments(n):
+    # Build `n` byte selectors; selector k returns the k-th 8-bit word of the value it is given.
+    def select_byte(lane):
+        # A factory (not a bare lambda in a loop) so each selector keeps its own `lane`.
+        return lambda word: word.word_select(lane, 8)
+    return list(map(select_byte, range(n)))
+
+def _default_tx_assignments(n):
+    # Build `n` byte writers; writer k returns the statement assigning a byte to the k-th 8-bit word.
+    def store_byte(lane):
+        # A factory (not a bare lambda in a loop) so each writer keeps its own `lane`.
+        return lambda word, value: word.word_select(lane, 8).eq(value)
+    return list(map(store_byte, range(n)))
+
+
+class FlowAndTriggerControl(wiring.Component):
+    trigger_en:  In(1)   # when set, capture additionally waits for the latched trigger input
+    enable:      In(1)   # master enable; also mirrored onto the trigger output pin
+    capture:     Out(1)  # registered capture gate
+
+    def __init__(self, domain):
+        super().__init__()
+        self._domain = domain
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # Board pins: the external trigger input, and a trigger output that simply
+        # mirrors `enable`.
+        trigger_in     = platform.request("trigger_in").i
+        trigger_out    = platform.request("trigger_out").o
+        m.d.comb += trigger_out.eq(self.enable)
+
+        # Latch the trigger input with an SB_DFFES primitive: the pin drives the
+        # asynchronous set, and D=0 is clocked in while `enable` is low.
+        trigger_in_latched = Signal()
+        m.submodules.trigger_in_reg = Instance("SB_DFFES",
+            i_D = 0,
+            i_S = trigger_in,  # async set
+            i_E = ~self.enable,
+            i_C = ClockSignal(self._domain),
+            o_Q = trigger_in_latched
+        )
+
+        # Capture gate, registered in the target domain: asserted while enabled and
+        # either the trigger has been latched or triggering is turned off.
+        trigger_satisfied = trigger_in_latched | ~self.trigger_en
+        m.d[self._domain] += self.capture.eq(self.enable & trigger_satisfied)
+
+        return m
diff --git a/firmware/fpga/requirements.txt b/firmware/fpga/requirements.txt
index 4b676b22c..87b248b05 100644
--- a/firmware/fpga/requirements.txt
+++ b/firmware/fpga/requirements.txt
@@ -1,3 +1,4 @@
 amaranth==v0.5.8
 amaranth-boards @ git+https://github.com/amaranth-lang/amaranth-boards.git@23c66d6
 lz4
+numpy
diff --git a/firmware/fpga/top/ext_precision_rx.py b/firmware/fpga/top/ext_precision_rx.py
index 6eb3f138d..3950458e9 100644
--- a/firmware/fpga/top/ext_precision_rx.py
+++ b/firmware/fpga/top/ext_precision_rx.py
@@ -4,15 +4,13 @@
 # Copyright (c) 2025 Great Scott Gadgets <info@greatscottgadgets.com>
 # SPDX-License-Identifier: BSD-3-Clause
 
-from amaranth               import Elaboratable, Module, Signal, Mux, Instance, Cat, ClockSignal, DomainRenamer
-from amaranth.lib           import io, fifo, stream, wiring
-from amaranth.lib.wiring    import Out, In, connect
+from amaranth               import Elaboratable, Module, Cat, DomainRenamer
+from amaranth.lib.wiring    import connect
 
 from amaranth_future        import fixed
 
 from board                  import PralinePlatform, ClockDomainGenerator
-from interface              import MAX586xInterface
-from interface.spi          import SPIRegisterInterface
+from interface              import MAX586xInterface, SGPIOInterface, SPIRegisterInterface
 from dsp.fir                import FIRFilter
 from dsp.fir_mac16          import HalfBandDecimatorMAC16
 from dsp.cic                import CICDecimator
@@ -21,119 +19,6 @@
 from util                   import ClockConverter, IQSample
 
 
-class MCUInterface(wiring.Component):
-    adc_stream: In(stream.Signature(IQSample(12), always_ready=True))
-    direction:  In(1)
-    enable:     In(1)
-    
-    def __init__(self, domain="sync"):
-        self._domain = domain
-        super().__init__()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        adc_stream = self.adc_stream
-
-        # Determine data transfer direction.
-        direction = Signal()
-        enable    = Signal()
-        m.d.sync += enable.eq(self.enable)
-        m.d.sync += direction.eq(self.direction)
-        transfer_from_adc = (direction == 0)
-
-        # SGPIO clock and data lines.
-        m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain)
-        m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain)
-
-        # State machine to control SGPIO clock and data lines.
-        rx_clk_en = Signal()
-        m.d.sync += clk_out.o[1].eq(rx_clk_en)
-        m.d.sync += host_io.oe.eq(transfer_from_adc)
-
-        data_to_host = Signal.like(adc_stream.p)
-        rx_data_buffer = Signal(8)
-        m.d.comb += host_io.o[0].eq(rx_data_buffer)
-        m.d.comb += host_io.o[1].eq(rx_data_buffer)
-
-        with m.FSM():
-            with m.State("IDLE"):
-                m.d.comb += rx_clk_en.eq(enable & transfer_from_adc & adc_stream.valid)
-
-                with m.If(rx_clk_en):
-                    m.d.sync += rx_data_buffer.eq(adc_stream.p.i >> 8)
-                    m.d.sync += data_to_host.eq(adc_stream.p)
-                    m.next = "RX_I1"
-
-            with m.State("RX_I1"):
-                m.d.comb += rx_clk_en.eq(1)
-                m.d.sync += rx_data_buffer.eq(data_to_host.i)
-                m.next = "RX_Q0"
-
-            with m.State("RX_Q0"):
-                m.d.comb += rx_clk_en.eq(1)
-                m.d.sync += rx_data_buffer.eq(data_to_host.q >> 8)
-                m.next = "RX_Q1"
-
-            with m.State("RX_Q1"):
-                m.d.comb += rx_clk_en.eq(1)
-                m.d.sync += rx_data_buffer.eq(data_to_host.q)
-                m.next = "IDLE"
-
-        if self._domain != "sync":
-            m = DomainRenamer(self._domain)(m)
-
-        return m
-
-
-class FlowAndTriggerControl(wiring.Component):
-    trigger_en:  In(1)
-    direction:   Out(1)  # async
-    enable:      Out(1)  # async
-    adc_capture: Out(1)
-    dac_capture: Out(1)
-
-    def __init__(self, domain):
-        super().__init__()
-        self._domain = domain
-
-    def elaborate(self, platform):
-        m = Module()
-
-        #
-        # Signal synchronization and trigger logic.
-        #
-        trigger_enable   = self.trigger_en
-        trigger_in       =  platform.request("trigger_in").i
-        trigger_out      =  platform.request("trigger_out").o
-        host_data_enable = ~platform.request("disable").i
-        m.d.comb += trigger_out.eq(host_data_enable)
-
-        # Create a latch for the trigger input signal using a special FPGA primitive.
-        trigger_in_latched = Signal()
-        trigger_in_reg = Instance("SB_DFFES",
-            i_D = 0,
-            i_S = trigger_in,  # async set
-            i_E = ~host_data_enable,
-            i_C = ClockSignal(self._domain),
-            o_Q = trigger_in_latched
-        )
-        m.submodules.trigger_in_reg = trigger_in_reg
-
-        # Export signals for direction control and capture gating.
-        m.d.comb += self.direction.eq(platform.request("direction").i)
-        m.d.comb += self.enable.eq(host_data_enable)
-        
-        with m.If(host_data_enable):
-            m.d[self._domain] += self.adc_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 0))
-            m.d[self._domain] += self.dac_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 1))
-        with m.Else():
-            m.d[self._domain] += self.adc_capture.eq(0)
-            m.d[self._domain] += self.dac_capture.eq(0)
-
-        return m
-
-
 class Top(Elaboratable):
 
     def elaborate(self, platform):
@@ -142,15 +27,25 @@ def elaborate(self, platform):
         m.submodules.clkgen = ClockDomainGenerator()
 
         # Submodules.
-        m.submodules.flow_ctl    = flow_ctl    = FlowAndTriggerControl(domain="gck1")
         m.submodules.adcdac_intf = adcdac_intf = MAX586xInterface(bb_domain="gck1")
-        m.submodules.mcu_intf    = mcu_intf    = MCUInterface(domain="sync")
+        m.submodules.mcu_intf    = mcu_intf    = SGPIOInterface(
+            sample_width=24,
+            rx_assignments=[
+                lambda w: Cat(w[8:12], w[11].replicate(4)),
+                lambda w: w[0:8],
+                lambda w: Cat(w[20:24], w[23].replicate(4)),
+                lambda w: w[12:20],
+            ],
+            tx_assignments=[
+                lambda w, v: w[8:12].eq(v),
+                lambda w, v: w[0:8].eq(v),
+                lambda w, v: w[20:24].eq(v),
+                lambda w, v: w[12:20].eq(v),
+            ],
+            domain="sync"
+        )
 
-        m.d.comb += adcdac_intf.adc_capture.eq(flow_ctl.adc_capture)
-        m.d.comb += adcdac_intf.dac_capture.eq(flow_ctl.dac_capture)
         m.d.comb += adcdac_intf.q_invert.eq(platform.request("q_invert").i)
-        m.d.comb += mcu_intf.direction.eq(flow_ctl.direction)
-        m.d.comb += mcu_intf.enable.eq(flow_ctl.enable)
 
         # Half-band filter taps.
         taps_hb1 = [-2, 0, 5, 0, -10, 0,18, 0, -30, 0,53, 0,-101, 0, 323, 512, 323, 0,-101, 0, 53, 0, -30, 0,18, 0, -10, 0, 5, 0,-2]
@@ -173,7 +68,7 @@ def elaborate(self, platform):
             "hbfir2":       HalfBandDecimatorMAC16(taps_hb2, data_shape=fixed.SQ(11), overclock_rate=8, always_ready=True, domain="gck1"),
 
             # Clock domain conversion.
-            "clkconv":      ClockConverter(IQSample(12), 4, "gck1", "sync", always_ready=True),
+            "clkconv":      ClockConverter(IQSample(12), 8, "gck1", "sync", always_ready=True),
         }
         for k,v in rx_chain.items():
             m.submodules[f"rx_{k}"] = v
@@ -196,7 +91,7 @@ def elaborate(self, platform):
 
         m.d.comb += [
             # Trigger enable.
-            flow_ctl.trigger_en                 .eq(ctrl[7]),
+            mcu_intf.trigger_en                 .eq(ctrl[7]),
 
             # RX settings.
             rx_chain["dc_block"].enable         .eq(ctrl[0]),
diff --git a/firmware/fpga/top/ext_precision_tx.py b/firmware/fpga/top/ext_precision_tx.py
index 4268606d1..6b55acc48 100644
--- a/firmware/fpga/top/ext_precision_tx.py
+++ b/firmware/fpga/top/ext_precision_tx.py
@@ -4,140 +4,19 @@
 # Copyright (c) 2025 Great Scott Gadgets <info@greatscottgadgets.com>
 # SPDX-License-Identifier: BSD-3-Clause
 
-from amaranth               import Elaboratable, Module, Signal, Instance, Cat, ClockSignal, DomainRenamer
-from amaranth.lib           import io, fifo, stream, wiring
-from amaranth.lib.wiring    import Out, In, connect
+from amaranth               import Elaboratable, Module, Cat, DomainRenamer
+from amaranth.lib.wiring    import connect
 
 from amaranth_future        import fixed
 
 from board                  import PralinePlatform, ClockDomainGenerator
-from interface              import MAX586xInterface
-from interface.spi          import SPIRegisterInterface
+from interface              import MAX586xInterface, SGPIOInterface, SPIRegisterInterface
 from dsp.fir                import FIRFilter
 from dsp.fir_mac16          import HalfBandInterpolatorMAC16
 from dsp.cic                import CICInterpolator
 from util                   import ClockConverter, IQSample, StreamSkidBuffer
 
 
-class MCUInterface(wiring.Component):
-    dac_stream: Out(stream.Signature(IQSample(12)))
-    direction:  In(1)
-    enable:     In(1)
-    
-    def __init__(self, domain="sync"):
-        self._domain = domain
-        super().__init__()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        dac_stream = self.dac_stream
-
-        # Determine data transfer direction.
-        direction = Signal()
-        enable    = Signal()
-        m.d.sync += enable.eq(self.enable)
-        m.d.sync += direction.eq(self.direction)
-        transfer_to_dac   = (direction == 1)
-
-        # SGPIO clock and data lines.
-        m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain)
-        m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain)
-
-        # State machine to control SGPIO clock and data lines.
-        tx_clk_en = Signal()
-        m.d.sync += clk_out.o[0].eq(tx_clk_en)
-
-        tx_dly_write = Signal(4)
-        tx_in_sample = Signal(4*8)
-        m.d.sync += tx_dly_write.eq(tx_dly_write << 1)
-        m.d.sync += tx_in_sample.eq(Cat(host_io.i[1], tx_in_sample))
-
-        # Small TX FIFO to avoid overflows from the write delay.
-        m.submodules.tx_fifo = tx_fifo = fifo.SyncFIFOBuffered(width=24, depth=4)
-        m.d.comb += [
-            tx_fifo.w_data.word_select(0, 12)   .eq(tx_in_sample[20:32]),
-            tx_fifo.w_data.word_select(1, 12)   .eq(tx_in_sample[4:16]),
-            tx_fifo.w_en                        .eq(tx_dly_write[-1]),
-            dac_stream.p                        .eq(tx_fifo.r_data),
-            dac_stream.valid                    .eq(tx_fifo.r_rdy),
-            tx_fifo.r_en                        .eq(dac_stream.ready),
-        ]
-
-        with m.FSM():
-            with m.State("IDLE"):
-                m.d.comb += tx_clk_en.eq(enable & transfer_to_dac & dac_stream.ready)
-
-                with m.If(tx_clk_en):
-                    m.next = "TX_I1"
-
-            with m.State("TX_I1"):
-                m.d.comb += tx_clk_en.eq(1)
-                m.next = "TX_Q0"
-
-            with m.State("TX_Q0"):
-                m.d.comb += tx_clk_en.eq(1)
-                m.next = "TX_Q1"
-
-            with m.State("TX_Q1"):
-                m.d.comb += tx_clk_en.eq(1)
-                m.d.sync += tx_dly_write[0].eq(1)  # delayed write
-                m.next = "IDLE"
-
-        if self._domain != "sync":
-            m = DomainRenamer(self._domain)(m)
-
-        return m
-
-
-class FlowAndTriggerControl(wiring.Component):
-    trigger_en:  In(1)
-    direction:   Out(1)  # async
-    enable:      Out(1)  # async
-    adc_capture: Out(1)
-    dac_capture: Out(1)
-
-    def __init__(self, domain):
-        super().__init__()
-        self._domain = domain
-
-    def elaborate(self, platform):
-        m = Module()
-
-        #
-        # Signal synchronization and trigger logic.
-        #
-        trigger_enable   = self.trigger_en
-        trigger_in       =  platform.request("trigger_in").i
-        trigger_out      =  platform.request("trigger_out").o
-        host_data_enable = ~platform.request("disable").i
-        m.d.comb += trigger_out.eq(host_data_enable)
-
-        # Create a latch for the trigger input signal using a special FPGA primitive.
-        trigger_in_latched = Signal()
-        trigger_in_reg = Instance("SB_DFFES",
-            i_D = 0,
-            i_S = trigger_in,  # async set
-            i_E = ~host_data_enable,
-            i_C = ClockSignal(self._domain),
-            o_Q = trigger_in_latched
-        )
-        m.submodules.trigger_in_reg = trigger_in_reg
-
-        # Export signals for direction control and capture gating.
-        m.d.comb += self.direction.eq(platform.request("direction").i)
-        m.d.comb += self.enable.eq(host_data_enable)
-        
-        with m.If(host_data_enable):
-            m.d[self._domain] += self.adc_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 0))
-            m.d[self._domain] += self.dac_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 1))
-        with m.Else():
-            m.d[self._domain] += self.adc_capture.eq(0)
-            m.d[self._domain] += self.dac_capture.eq(0)
-
-        return m
-
-
 class Top(Elaboratable):
 
     def elaborate(self, platform):
@@ -146,15 +25,27 @@ def elaborate(self, platform):
         m.submodules.clkgen = ClockDomainGenerator()
 
         # Submodules.
-        m.submodules.flow_ctl    = flow_ctl    = FlowAndTriggerControl(domain="gck1")
         m.submodules.adcdac_intf = adcdac_intf = MAX586xInterface(bb_domain="gck1")
-        m.submodules.mcu_intf    = mcu_intf    = MCUInterface(domain="sync")
+        m.submodules.mcu_intf    = mcu_intf    = SGPIOInterface(
+            sample_width=24,
+            rx_assignments=[
+                lambda w: Cat(w[8:12], w[11].replicate(4)),
+                lambda w: w[0:8],
+                lambda w: Cat(w[20:24], w[23].replicate(4)),
+                lambda w: w[12:20],
+            ],
+            tx_assignments=[
+                lambda w, v: w[8:12].eq(v),
+                lambda w, v: w[0:8].eq(v),
+                lambda w, v: w[20:24].eq(v),
+                lambda w, v: w[12:20].eq(v),
+            ],
+            domain="sync"
+        )
 
-        m.d.comb += adcdac_intf.dac_capture.eq(flow_ctl.dac_capture)
         m.d.comb += adcdac_intf.q_invert.eq(platform.request("q_invert").i)
-        m.d.comb += mcu_intf.direction.eq(flow_ctl.direction)
-        m.d.comb += mcu_intf.enable.eq(flow_ctl.enable)
 
+        
         # Half-band filter taps.
         taps_hb1 = [-2, 0, 5, 0, -10, 0,18, 0, -30, 0,53, 0,-101, 0, 323, 512, 323, 0,-101, 0, 53, 0, -30, 0,18, 0, -10, 0, 5, 0,-2]
         taps_hb1 = [ tap/1024 for tap in taps_hb1 ]
@@ -164,7 +55,7 @@ def elaborate(self, platform):
 
         tx_chain = {
             # Clock domain conversion.
-            "clkconv":          ClockConverter(IQSample(12), 4, "sync", "gck1", always_ready=False),
+            "clkconv":          ClockConverter(IQSample(12), 8, "sync", "gck1", always_ready=False),
 
             # Half-band interpolation stages (+ skid buffers for timing closure).
             "hbfir1":           HalfBandInterpolatorMAC16(taps_hb1, data_shape=fixed.SQ(11),
@@ -176,7 +67,6 @@ def elaborate(self, platform):
 
             # CIC interpolation stage.
             "cic_comp":         DomainRenamer("gck1")(FIRFilter([-0.125, 0, 0.75, 0, -0.125], shape=fixed.SQ(11), shape_out=fixed.SQ(11), always_ready=False, num_channels=2)),
-
             "cic_interpolator": CICInterpolator(2, 4, (4, 8, 16, 32), 12, 8, num_channels=2, 
                 always_ready=False, domain="gck1"),
         }
@@ -201,7 +91,7 @@ def elaborate(self, platform):
 
         m.d.comb += [
             # Trigger enable.
-            flow_ctl.trigger_en                 .eq(ctrl[7]),
+            mcu_intf.trigger_en                 .eq(ctrl[7]),
 
             # TX interpolation rate.
             tx_chain["cic_interpolator"].factor .eq(tx_intrp + 2),
diff --git a/firmware/fpga/top/half_precision.py b/firmware/fpga/top/half_precision.py
index 4cc0e20b0..974f68eee 100644
--- a/firmware/fpga/top/half_precision.py
+++ b/firmware/fpga/top/half_precision.py
@@ -5,128 +5,17 @@
 # Copyright (c) 2024 Great Scott Gadgets <info@greatscottgadgets.com>
 # SPDX-License-Identifier: BSD-3-Clause
 
-from amaranth               import Elaboratable, Module, Signal, C, Mux, Instance, Cat, ClockSignal, DomainRenamer, signed
-from amaranth.lib           import io, stream, wiring, cdc, data, fifo
+from amaranth               import Elaboratable, Module, DomainRenamer
+from amaranth.lib           import stream, wiring
 from amaranth.lib.wiring    import Out, In, connect
 
 from board                  import PralinePlatform, ClockDomainGenerator
-from interface              import MAX586xInterface
-from interface.spi          import SPIRegisterInterface
+from interface              import MAX586xInterface, SGPIOInterface, SPIRegisterInterface
 from dsp.dc_block           import DCBlock
 from dsp.round              import convergent_round
 from util                   import IQSample, ClockConverter
 
 
-class MCUInterface(wiring.Component):
-    adc_stream: In(stream.Signature(IQSample(4), always_ready=True))
-    dac_stream: Out(stream.Signature(IQSample(4)))
-    direction:  In(1)
-    enable:     In(1)
-    
-    def __init__(self, domain="sync"):
-        self._domain = domain
-        super().__init__()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        adc_stream = self.adc_stream
-        dac_stream = self.dac_stream
-
-        # Determine data transfer direction.
-        direction = Signal()
-        enable    = Signal()
-        m.d.sync += enable.eq(self.enable)
-        m.d.sync += direction.eq(self.direction)
-        transfer_from_adc = (direction == 0)
-        transfer_to_dac   = (direction == 1)
-
-        # SGPIO clock and data lines.
-        m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain)
-        m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain)
-
-        # State machine to control SGPIO clock and data lines.
-        m.d.sync += clk_out.o[0].eq(0)
-        m.d.sync += clk_out.o[1].eq(0)
-        m.d.sync += host_io.oe.eq(transfer_from_adc)
-
-        data_to_host = Signal.like(Cat(adc_stream.p.i, adc_stream.p.q))
-        assert len(data_to_host) == 8
-        m.d.comb += host_io.o[0].eq(data_to_host)
-        m.d.comb += host_io.o[1].eq(data_to_host)
-
-        tx_dly_write = Signal(2)
-        m.d.sync += tx_dly_write.eq(tx_dly_write << 1)
-        m.d.comb += dac_stream.payload.eq(host_io.i[1])
-        m.d.comb += dac_stream.valid.eq(tx_dly_write[-1])
-
-        with m.FSM():
-            with m.State("IDLE"):
-                with m.If(enable):
-                    with m.If(transfer_from_adc & adc_stream.valid):
-                        m.d.sync += data_to_host.eq(Cat(adc_stream.p.i, adc_stream.p.q))
-                        m.d.sync += clk_out.o[1].eq(1)
-
-                    with m.Elif(transfer_to_dac & dac_stream.ready):
-                        m.d.sync += clk_out.o[0].eq(1)
-                        m.d.sync += tx_dly_write[0].eq(1)  # delayed write
-
-        if self._domain != "sync":
-            m = DomainRenamer(self._domain)(m)
-
-        return m
-
-
-class FlowAndTriggerControl(wiring.Component):
-    trigger_en:  In(1)
-    direction:   Out(1)  # async
-    enable:      Out(1)  # async
-    adc_capture: Out(1)
-    dac_capture: Out(1)
-
-    def __init__(self, domain):
-        super().__init__()
-        self._domain = domain
-
-    def elaborate(self, platform):
-        m = Module()
-
-        #
-        # Signal synchronization and trigger logic.
-        #
-        trigger_enable   =  self.trigger_en
-        trigger_in       =  platform.request("trigger_in").i
-        trigger_out      =  platform.request("trigger_out").o
-        host_data_enable = ~platform.request("disable").i
-        m.d.comb += trigger_out.eq(host_data_enable)
-
-        # Create a latch for the trigger input signal using a FPGA primitive.
-        trigger_in_latched = Signal()
-        trigger_in_reg = Instance("SB_DFFES",
-            i_D = 0,
-            i_S = trigger_in,  # async set
-            i_E = ~host_data_enable,
-            i_C = ClockSignal(self._domain),
-            o_Q = trigger_in_latched
-        )
-        m.submodules.trigger_in_reg = trigger_in_reg
-
-        # Export signals for direction control and gating captures.
-        m.d.comb += self.direction.eq(platform.request("direction").i)
-        m.d.comb += self.enable.eq(host_data_enable)
-        
-        with m.If(host_data_enable):
-            m.d[self._domain] += self.adc_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 0))
-            m.d[self._domain] += self.dac_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 1))
-        with m.Else():
-            m.d[self._domain] += self.adc_capture.eq(0)
-            m.d[self._domain] += self.dac_capture.eq(0)
-
-        return m
-
-
-
-
 class IQHalfPrecisionConverter(wiring.Component):
     input:  In(stream.Signature(IQSample(8), always_ready=True))
     output: Out(stream.Signature(IQSample(4), always_ready=True))
@@ -167,22 +56,18 @@ def elaborate(self, platform):
         m.submodules.clkgen = ClockDomainGenerator()
 
         # Submodules.
-        m.submodules.flow_ctl    = flow_ctl    = FlowAndTriggerControl(domain="gck1")
         m.submodules.adcdac_intf = adcdac_intf = MAX586xInterface(bb_domain="gck1")
-        m.submodules.mcu_intf    = mcu_intf    = MCUInterface(domain="sync")
+        m.submodules.mcu_intf    = mcu_intf    = SGPIOInterface(sample_width=8, domain="sync")
 
-        m.d.comb += adcdac_intf.adc_capture.eq(flow_ctl.adc_capture)
-        m.d.comb += adcdac_intf.dac_capture.eq(flow_ctl.dac_capture)
         m.d.comb += adcdac_intf.q_invert.eq(platform.request("q_invert").i)
-        m.d.comb += mcu_intf.direction.eq(flow_ctl.direction)
-        m.d.comb += mcu_intf.enable.eq(flow_ctl.enable)
 
         rx_chain = {
             "dc_block":      DCBlock(width=8, num_channels=2, domain="gck1"),
             "half_prec":     DomainRenamer("gck1")(IQHalfPrecisionConverter()),
-            "clkconv":       ClockConverter(IQSample(4), 4, "gck1", "sync"),
+            "clkconv":       ClockConverter(IQSample(4), 16, "gck1", "sync"),
         }
-        m.submodules += rx_chain.values()
+        for k,v in rx_chain.items():
+            m.submodules[f"rx_{k}"] = v
 
         # Connect receiver chain.
         last = adcdac_intf.adc_stream
@@ -193,10 +78,11 @@ def elaborate(self, platform):
 
         
         tx_chain = {
-            "clkconv":       ClockConverter(IQSample(4), 4, "sync", "gck1", always_ready=False),
+            "clkconv":       ClockConverter(IQSample(4), 16, "sync", "gck1", always_ready=False),
             "half_prec":     DomainRenamer("gck1")(IQHalfPrecisionConverterInv()),
         }
-        m.submodules += tx_chain.values()
+        for k,v in tx_chain.items():
+            m.submodules[f"tx_{k}"] = v
 
         # Connect transmitter chain.
         last = mcu_intf.dac_stream
@@ -213,7 +99,7 @@ def elaborate(self, platform):
         ctrl  = spi_regs.add_register(0x01, init=0)
         m.d.comb += [
             # Trigger enable.
-            flow_ctl.trigger_en                 .eq(ctrl[7]),
+            mcu_intf.trigger_en                 .eq(ctrl[7]),
 
             # RX settings.
             rx_chain["dc_block"].enable         .eq(ctrl[0]),
@@ -224,4 +110,4 @@ def elaborate(self, platform):
 
 if __name__ == "__main__":
     plat = PralinePlatform()
-    plat.build(Top_HP())
+    plat.build(Top())
diff --git a/firmware/fpga/top/standard.py b/firmware/fpga/top/standard.py
index 50c73df8b..7f85925b1 100644
--- a/firmware/fpga/top/standard.py
+++ b/firmware/fpga/top/standard.py
@@ -4,168 +4,20 @@
 # Copyright (c) 2025 Great Scott Gadgets <info@greatscottgadgets.com>
 # SPDX-License-Identifier: BSD-3-Clause
 
-from amaranth               import Elaboratable, Module, Signal, Mux, Instance, Cat, ClockSignal, DomainRenamer, EnableInserter
-from amaranth.lib           import io, fifo, stream, wiring, cdc
-from amaranth.lib.wiring    import Out, In, connect
+from amaranth               import Elaboratable, Module, Signal, Mux, DomainRenamer
+from amaranth.lib           import cdc
+from amaranth.lib.wiring    import connect
 
 from amaranth_future        import fixed
 
 from board                  import PralinePlatform, ClockDomainGenerator
-from interface              import MAX586xInterface
-from interface.spi          import SPIRegisterInterface
+from interface              import MAX586xInterface, SGPIOInterface, SPIRegisterInterface
 from dsp.fir                import HalfBandDecimator, HalfBandInterpolator
-from dsp.cic                import CICDecimator, CICInterpolator
+from dsp.cic                import CICInterpolator
 from dsp.dc_block           import DCBlock
 from dsp.quarter_shift      import QuarterShift
 from dsp.nco                import NCO
-from util                   import ClockConverter, IQSample, StreamSkidBuffer, LinearFeedbackShiftRegister
-
-
-class MCUInterface(wiring.Component):
-    adc_stream: In(stream.Signature(IQSample(8), always_ready=True))
-    dac_stream: Out(stream.Signature(IQSample(8)))
-    direction:  In(1)
-    enable:     In(1)
-    prbs:       In(1)
-    
-    def __init__(self, domain="sync"):
-        self._domain = domain
-        super().__init__()
-
-    def elaborate(self, platform):
-        m = Module()
-
-        adc_stream = self.adc_stream
-        dac_stream = self.dac_stream
-
-        # Determine data transfer direction.
-        direction = Signal()
-        enable    = Signal()
-        m.submodules.enable_cdc = cdc.FFSynchronizer(self.enable, enable, o_domain=self._domain)
-        m.submodules.direction_cdc = cdc.FFSynchronizer(self.direction, direction, o_domain=self._domain)
-        transfer_from_adc = (direction == 0)
-        transfer_to_dac   = (direction == 1)
-
-        # SGPIO clock and data lines.
-        m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain)
-        m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain)
-
-        # State machine to control SGPIO clock and data lines.
-        tx_clk_en = Signal()
-        rx_clk_en = Signal()
-        m.d.sync += clk_out.o[0].eq(tx_clk_en)
-        m.d.sync += clk_out.o[1].eq(rx_clk_en)
-        m.d.sync += host_io.oe.eq(transfer_from_adc)
-
-        data_to_host = Signal.like(adc_stream.p)
-        m.d.comb += host_io.o[0].eq(data_to_host)
-        m.d.comb += host_io.o[1].eq(data_to_host)
-
-        tx_dly_write = Signal(3)
-        host_io_prev_data = Signal(8)
-        m.d.sync += tx_dly_write.eq(tx_dly_write << 1)
-        m.d.sync += host_io_prev_data.eq(host_io.i[1])
-
-        # Small TX FIFO to avoid overflows from the write delay.
-        m.submodules.tx_fifo = tx_fifo = fifo.SyncFIFOBuffered(width=16, depth=8)
-        m.d.comb += [
-            tx_fifo.w_data      .eq(Cat(host_io_prev_data, host_io.i[1])),
-            tx_fifo.w_en        .eq(tx_dly_write[-1]),
-            dac_stream.p        .eq(tx_fifo.r_data),
-            dac_stream.valid    .eq(tx_fifo.r_rdy),
-            tx_fifo.r_en        .eq(dac_stream.ready),
-        ]
-
-        # Pseudo-random binary sequence generator.
-        prbs_advance = Signal()
-        prbs_count = Signal(2)
-        m.submodules.prbs = prbs = EnableInserter(prbs_advance)(
-            LinearFeedbackShiftRegister(degree=8, taps=[8,6,5,4], init=0b10110001))
-
-        with m.FSM():
-            with m.State("IDLE"):
-                m.d.comb += tx_clk_en.eq(enable & transfer_to_dac & dac_stream.ready)
-                m.d.comb += rx_clk_en.eq(enable & transfer_from_adc & adc_stream.valid)
-
-                with m.If(self.prbs):
-                    m.next = "PRBS"
-                with m.Elif(rx_clk_en):
-                    m.d.sync += data_to_host.eq(adc_stream.p)
-                    m.next = "RX_Q"
-                with m.Elif(tx_clk_en):
-                    m.next = "TX_Q"
-
-            with m.State("RX_Q"):
-                m.d.comb += rx_clk_en.eq(1)
-                m.d.sync += data_to_host.i.eq(data_to_host.q)
-                m.next = "IDLE"
-
-            with m.State("TX_Q"):
-                m.d.comb += tx_clk_en.eq(1)
-                m.d.sync += tx_dly_write[0].eq(1)  # delayed write
-                m.next = "IDLE"
-
-            with m.State("PRBS"):
-                m.d.sync += host_io.oe.eq(1)
-                m.d.sync += data_to_host.eq(prbs.value)
-                m.d.comb += rx_clk_en.eq(prbs_count == 0)
-                m.d.comb += prbs_advance.eq(prbs_count == 0)
-                m.d.sync += prbs_count.eq(prbs_count + 1)
-                with m.If(~self.prbs):
-                    m.next = "IDLE"
-
-        if self._domain != "sync":
-            m = DomainRenamer(self._domain)(m)
-
-        return m
-
-
-class FlowAndTriggerControl(wiring.Component):
-    trigger_en:  In(1)
-    direction:   Out(1)  # async
-    enable:      Out(1)  # async
-    adc_capture: Out(1)
-    dac_capture: Out(1)
-
-    def __init__(self, domain):
-        super().__init__()
-        self._domain = domain
-
-    def elaborate(self, platform):
-        m = Module()
-
-        #
-        # Signal synchronization and trigger logic.
-        #
-        trigger_enable   = self.trigger_en
-        trigger_in       =  platform.request("trigger_in").i
-        trigger_out      =  platform.request("trigger_out").o
-        host_data_enable = ~platform.request("disable").i
-        m.d.comb += trigger_out.eq(host_data_enable)
-
-        # Create a latch for the trigger input signal using a special FPGA primitive.
-        trigger_in_latched = Signal()
-        trigger_in_reg = Instance("SB_DFFES",
-            i_D = 0,
-            i_S = trigger_in,  # async set
-            i_E = ~host_data_enable,
-            i_C = ClockSignal(self._domain),
-            o_Q = trigger_in_latched
-        )
-        m.submodules.trigger_in_reg = trigger_in_reg
-
-        # Export signals for direction control and capture gating.
-        m.d.comb += self.direction.eq(platform.request("direction").i)
-        m.d.comb += self.enable.eq(host_data_enable)
-        
-        with m.If(host_data_enable):
-            m.d[self._domain] += self.adc_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 0))
-            m.d[self._domain] += self.dac_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 1))
-        with m.Else():
-            m.d[self._domain] += self.adc_capture.eq(0)
-            m.d[self._domain] += self.dac_capture.eq(0)
-
-        return m
+from util                   import ClockConverter, IQSample, StreamSkidBuffer
 
 
 class Top(Elaboratable):
@@ -176,15 +28,10 @@ def elaborate(self, platform):
         m.submodules.clkgen = ClockDomainGenerator()
 
         # Submodules.
-        m.submodules.flow_ctl    = flow_ctl    = FlowAndTriggerControl(domain="gck1")
         m.submodules.adcdac_intf = adcdac_intf = MAX586xInterface(bb_domain="gck1")
-        m.submodules.mcu_intf    = mcu_intf    = MCUInterface(domain="sync")
+        m.submodules.mcu_intf    = mcu_intf    = SGPIOInterface(sample_width=16, domain="sync")
 
-        m.d.comb += adcdac_intf.adc_capture.eq(flow_ctl.adc_capture)
-        m.d.comb += adcdac_intf.dac_capture.eq(flow_ctl.dac_capture)
         m.d.comb += adcdac_intf.q_invert.eq(platform.request("q_invert").i)
-        m.d.comb += mcu_intf.direction.eq(flow_ctl.direction)
-        m.d.comb += mcu_intf.enable.eq(flow_ctl.enable)
 
         # Half-band filter taps.
         taps = [-2, 0, 7, 0, -18, 0, 41, 0, -92, 0, 320, 512, 320, 0, -92, 0, 41, 0, -18, 0, 7, 0, -2]
@@ -221,7 +68,7 @@ def elaborate(self, platform):
             "hbfir1":        HalfBandDecimator(taps, **common_rx_filter_opts),
 
             # Clock domain conversion.
-            "clkconv":       ClockConverter(IQSample(8), 4, "gck1", "sync"),
+            "clkconv":       ClockConverter(IQSample(8), 8, "gck1", "sync"),
         }
         for k,v in rx_chain.items():
             m.submodules[f"rx_{k}"] = v
@@ -235,7 +82,7 @@ def elaborate(self, platform):
 
         tx_chain = {
             # Clock domain conversion.
-            "clkconv":          ClockConverter(IQSample(8), 4, "sync", "gck1", always_ready=False),
+            "clkconv":          ClockConverter(IQSample(8), 8, "sync", "gck1", always_ready=False),
 
             # Half-band interpolation stages (+ skid buffers for timing closure).
             "hbfir1":           HalfBandInterpolator(taps, data_shape=fixed.SQ(7), 
@@ -248,6 +95,7 @@ def elaborate(self, platform):
             # CIC interpolation stage.
             "cic_interpolator": CICInterpolator(1, 3, (1, 2, 4, 8), 8, 8, num_channels=2, 
                 always_ready=False, domain="gck1"),
+            "skid4":            DomainRenamer("gck1")(StreamSkidBuffer(IQSample(8), always_ready=False)),
         }
         for k,v in tx_chain.items():
             m.submodules[f"tx_{k}"] = v
@@ -263,7 +111,7 @@ def elaborate(self, platform):
             m.d.comb += [
                 adcdac_intf.dac_stream.p.eq(nco.output),
                 adcdac_intf.dac_stream.valid.eq(1),
-                tx_chain["cic_interpolator"].output.ready.eq(1),
+                last.ready.eq(1),
             ]
         with m.Else():
             connect(m, last, adcdac_intf.dac_stream)
@@ -281,7 +129,7 @@ def elaborate(self, platform):
 
         m.d.sync += [
             # Trigger enable.
-            flow_ctl.trigger_en                 .eq(ctrl[7]),
+            mcu_intf.trigger_en                 .eq(ctrl[7]),
 
             # PRBS enable.
             mcu_intf.prbs                       .eq(ctrl[6]),
diff --git a/firmware/fpga/util/__init__.py b/firmware/fpga/util/__init__.py
index 75334121a..47e88e5e0 100644
--- a/firmware/fpga/util/__init__.py
+++ b/firmware/fpga/util/__init__.py
@@ -35,7 +35,7 @@ def __init__(self, shape, depth, input_domain, output_domain, always_ready=True)
     def elaborate(self, platform):
         m = Module()
 
-        m.submodules.mem = mem = fifo.AsyncFIFO(
+        m.submodules.mem = mem = fifo.AsyncFIFOBuffered(
             width=Shape.cast(self.shape).width,
             depth=self.depth,
             r_domain=self._output_domain,