[pace-fix] Refactor ping-pong input and output buffers using existing IPs.

arpansur · arpansur · commit 22f30133d028 · 2025-07-15T13:43:11.000+02:00
The interface is verified to handle stalls coming from the memory side.
The test stimuli generation in pace.py is also updated to include the exp function.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+**venv
+**.bender
+target/sim/vsim/work/
+sw/build/
+golden-model/**/txt/
+target/sim/vsim/vsim.wlf
+sw/inc/*
+.bender/** */
+target/sim/vsim/transcript**
diff --git a/golden-model/FP16/scripts/pace.py b/golden-model/FP16/scripts/pace.py
@@ -15,6 +15,9 @@ def float16_to_hex(f16_val):
 def silu(x):
     return x / (1 + np.exp(-x))
 
+def exp(x):
+    return np.exp(x)
+
 # === BST Partitioning ===
 
 def build_bst_indices(n_partitions):
@@ -61,7 +64,7 @@ def piecewise_poly_approx_bst_fp16(
 
     coeffs = []
     for i in range(partitions):
-        x = np.linspace(raw_bps[i], raw_bps[i + 1], 50).astype(np.float16)
+        x = np.linspace(raw_bps[i], raw_bps[i + 1], 500).astype(np.float16)
         y = func(x.astype(np.float32)).astype(np.float16)
         p = np.polynomial.Polynomial.fit(x.astype(np.float32), y.astype(np.float32), deg=degree,
                                          domain=[float(raw_bps[i]), float(raw_bps[i + 1])])
@@ -73,6 +76,7 @@ def piecewise_poly_approx_bst_fp16(
     y_approx = np.zeros_like(x_vals)
 
     debug_lines = []
+    custom_debug_lines = []
 
     # breakpoint layout
     debug_lines.append("=== Raw Breakpoints (sorted) ===")
@@ -115,23 +119,27 @@ def piecewise_poly_approx_bst_fp16(
         dbg.append(f"  y_approx  = {y_approx[idx]:.5f} ({float16_to_hex(y_approx[idx])})")
         dbg.append(f"  error     = {float(y_true[idx] - y_approx[idx]):.5f}")
         debug_lines.append("\n".join(dbg) + "\n")
+        if idx % 16 == 0:
+            custom_debug_lines.append("\n".join(dbg))
 
     return {
         "x_vals": x_vals,
         "y_true": y_true,
         "y_approx": y_approx,
         "breakpoints_bst": breakpoints_bst,
         "coeffs": coeffs,
-        "debug_lines": debug_lines
+        "debug_lines": debug_lines,
+        "custom_debug_lines": custom_debug_lines
     }
 
 
 def write_debug_output(results, debug_file="execution.txt"):
     with open(debug_file, "w") as f:
-        for line in results["debug_lines"]:
+        for line in results:
             f.write(line + "\n")
     print(f"✅ Debug written to: {debug_file}")
 
+
 def write_coefficients_output(results, coeff_file="coefficients.txt"):
     with open(coeff_file, "w") as f:
         for i, coeffs in enumerate(results["coeffs"]):
@@ -245,8 +253,8 @@ def write_tensor_dim_inc_file(stimuli_file = "tensor_dim.h", n_tests=1000):
     f_d.write('#ifndef __TENSOR_DIM__\n'       )
     f_d.write('#define __TENSOR_DIM__\n\n'     )
     f_d.write('#define M_SIZE  8 \n'           )
-    f_d.write('#define N_SIZE  32\n'            )
-    f_d.write(f'#define K_SIZE  {n_tests/8} \n')
+    f_d.write('#define N_SIZE  64\n'            )
+    f_d.write(f'#define K_SIZE  {n_tests/32} \n')
     f_d.write('#define SRC_FMT FP16\n'         )
     f_d.write('#define DST_FMT FP16\n'         )
     f_d.write('#define FPFORMAT 16\n'          )
@@ -258,27 +266,28 @@ def write_tensor_dim_inc_file(stimuli_file = "tensor_dim.h", n_tests=1000):
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser("PACE Operation Test")
-    parser.add_argument( '--x_min',     type=int, default=-6 )
-    parser.add_argument( '--x_max',     type=int, default=6 )
+    parser.add_argument( '--x_min',     type=int, default=-11 )
+    parser.add_argument( '--x_max',     type=int, default=0 )
     parser.add_argument( '--f_name',    type=str, default="silu" )
     parser.add_argument( '--n_parts',   type=int, default=8 )
     parser.add_argument( '--n_deg',     type=int, default=4 )
-    parser.add_argument( '--n_tests',   type=int, default=1024 )
+    parser.add_argument( '--n_tests',   type=int, default=4096 )
     parser.add_argument( '--file_name', type=str, default='net_parameters.h')
     parser.add_argument( '--inc_dir',   type=str)
     parser.add_argument( '--txt_dir',   type=str)
     args = parser.parse_args()
     results = piecewise_poly_approx_bst_fp16(
-        silu, xmin=-6, xmax=6, degree=4, partitions=8, n_stimuli=args.n_tests
+        exp, xmin=args.x_min, xmax=args.x_max, degree=4, partitions=8, n_stimuli=args.n_tests
     )
-    write_debug_output(debug_file=os.path.join(args.txt_dir,"execution.txt"),results=results)
+    write_debug_output(debug_file=os.path.join(args.txt_dir,"execution.txt"),results=results["debug_lines"])
+    write_debug_output(debug_file=os.path.join(args.txt_dir,"execution_custom.txt"),results=results["custom_debug_lines"])
     write_coefficients_output(coeff_file=os.path.join(args.txt_dir,"coefficients.txt"), results=results)
     write_inp_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "w_input.h"))
     write_golden_oup_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "golden.h"))
     write_actual_oup_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "z_output.h"))
     write_golden_inc_debug_file(results, stimuli_file=os.path.join(args.txt_dir, "golden_debug.h"))
     write_y_inp_inc_file(stimuli_file=os.path.join(args.inc_dir, "y_input.h"))
-    write_x_file(coeffs=results["coeffs"], xmin=-6, xmax=6, partitions=8, stimuli_file=os.path.join(args.inc_dir, "x_input.h")) 
+    write_x_file(coeffs=results["coeffs"], xmin=args.x_min, xmax=args.x_max, partitions=8, stimuli_file=os.path.join(args.inc_dir, "x_input.h")) 
     write_tensor_dim_inc_file(stimuli_file=os.path.join(args.inc_dir, "tensor_dim.h"), n_tests=args.n_tests)
 
 
diff --git a/rtl/pace/pace_pingpong_inp.sv b/rtl/pace/pace_pingpong_inp.sv
@@ -22,13 +22,23 @@ module pace_pingpong_inp #(
 
   // Local signals
   hwpe_stream_intf_stream #(
-    .DATA_WIDTH ( InpDataWidth*NumRows )
+    .DATA_WIDTH (InpDataWidth/2 )
   ) ping_pong_buffer [1:0] (
     .clk ( clk_i )
   );
-  logic                            output_handshake;
-  logic [NumRows*OupDataWidth-1:0] output_buffer;
-  logic                            ping_pong_status_d, ping_pong_status_q;
+
+  hwpe_stream_intf_stream #(
+    .DATA_WIDTH ( InpDataWidth/2 )
+  ) output_buffer (
+    .clk ( clk_i )
+  );
+
+  hwpe_stream_intf_stream #(
+    .DATA_WIDTH ( InpDataWidth/2 )
+  ) output_buffer_fifo (
+    .clk ( clk_i )
+  );
+
 
   // Stream splitter
   hwpe_stream_split #(
@@ -42,33 +52,49 @@ module pace_pingpong_inp #(
     .pop_o   ( ping_pong_buffer )
   );
 
-  // Ready/valid handshake
-  assign ping_pong_buffer[0].ready = output_handshake & ping_pong_status_q & enable_i;
-  assign ping_pong_buffer[1].ready = output_handshake & ping_pong_status_q & enable_i;
+  hwpe_stream_package::ctrl_serdes_t           ctrl_serdes;
+
+  assign ctrl_serdes.clear_serdes_state = clear_i;
+  assign ctrl_serdes.nb_contig_m1 = 0;
+  assign ctrl_serdes.first_stream = 1'b0;
 
-  assign output_buffer = ping_pong_status_q ? ping_pong_buffer[1].data  : ping_pong_buffer[0].data;
-  assign valid_o       = ping_pong_status_q ? ping_pong_buffer[1].valid : ping_pong_buffer[0].valid;
+  hwpe_stream_serialize #(
+    .NB_IN_STREAMS ( 2       ),
+    .CONTIG_LIMIT  ( 1024       ),
+    .DATA_WIDTH    ( NumRows*OupDataWidth ),
+    .SYNC_READY    ( 1'b1    )
+  ) i_hwpe_stream_serialize (
+    .clk_i   ( clk_i            ),
+    .rst_ni  ( rst_ni           ),
+    .clear_i ( clear_i          ),
+    .ctrl_i  ( ctrl_serdes      ),
+    .push_i  ( ping_pong_buffer ),
+    .pop_o   ( output_buffer    )
+  );
+
+  hwpe_stream_fifo #(
+    .DATA_WIDTH ( NumRows*OupDataWidth ),
+    .FIFO_DEPTH ( 2                     ),
+    .LATCH_FIFO ( 0                     ),
+    .LATCH_FIFO_TEST_WRAP ( 0          )
+  ) i_hwpe_stream_fifo (
+    .clk_i   ( clk_i            ),
+    .rst_ni  ( rst_ni           ),
+    .clear_i ( clear_i          ),
+    .flags_o (        ),
+    .push_i  ( output_buffer   ),
+    .pop_o   ( output_buffer_fifo )
+  );
 
   // Output slicing
   generate
     for (genvar r = 0; r < NumRows; r++) begin : gen_output_unpack
-      assign output_o[r] = output_buffer[(OupDataWidth*(r+1))-1 -: OupDataWidth];
+      assign output_o[r] = output_buffer_fifo.data[(OupDataWidth*(r+1))-1 -: OupDataWidth];
     end
   endgenerate
+  assign valid_o = output_buffer_fifo.valid;
+  assign output_buffer_fifo.ready = ready_i & enable_i;
 
-  // Handshake logic
-  assign output_handshake = valid_o & ready_i;
 
-  // Ping-pong control
-  assign ping_pong_status_d = clear_i             ? 1'b0 :
-                              output_handshake    ? ~ping_pong_status_q :
-                                                    ping_pong_status_q;
-  always_ff @(posedge clk_i or negedge rst_ni) begin : gen_ping_pong_status_ff
-    if (~rst_ni) begin
-      ping_pong_status_q <= 1'b0;
-    end else begin
-      ping_pong_status_q <= ping_pong_status_d;
-    end
-  end
 
 endmodule
diff --git a/rtl/pace/pace_pingpong_oup.sv b/rtl/pace/pace_pingpong_oup.sv
@@ -10,7 +10,10 @@
 
 module pace_pingpong_oup #(
   parameter int unsigned NumRows        = 8,
-  parameter int unsigned InpDataWidth   = 16
+  parameter int unsigned InpDataWidth   = 16,
+  localparam int unsigned InputStreamWidth = NumRows*InpDataWidth,
+  localparam int unsigned NumStreams = 2,
+  localparam int unsigned OutputStreamWidth = NumStreams*InputStreamWidth
 ) (
   input  logic                                 clk_i,
   input  logic                                 rst_ni,
@@ -22,51 +25,111 @@ module pace_pingpong_oup #(
   hwpe_stream_intf_stream.source               output_o
 );
 
-  // Internal signals
-  logic [NumRows*InpDataWidth-1:0]    input_buffer_d, input_buffer_q;
-  logic [2*NumRows*InpDataWidth-1:0]  flattened_oup_buffer;
-  logic                               ping_pong_status_d, ping_pong_status_q;
-  logic                               input_handshake;
-
-  // Handshake
-  assign input_handshake = valid_i & ready_o;
-
-  // Input buffering logic
-  assign input_buffer_d = clear_i ? '0 :
-                          (input_handshake && ~ping_pong_status_q) ? input_i : input_buffer_q;
-
-  // Flatten the buffered output
-  generate
-    for (genvar r = 0; r < 2*NumRows; r++) begin : gen_flattened_output
-      if (r < NumRows) begin : gen_even_entry
-        assign flattened_oup_buffer[(r+1)*InpDataWidth-1 -: InpDataWidth] = input_buffer_q[(r+1)*InpDataWidth-1 -: InpDataWidth];
-      end else begin : gen_odd_entry
-        assign flattened_oup_buffer[(r+1)*InpDataWidth-1 -: InpDataWidth] = input_i[r-NumRows];
-      end
-    end
-  endgenerate
+  hwpe_stream_intf_stream #(
+    .DATA_WIDTH (InputStreamWidth)
+  ) input_stream (
+    .clk ( clk_i )
+  );
+
+  assign input_stream.data   = input_i;
+  assign input_stream.valid  = valid_i;
+  assign input_stream.strb  = '1;
+  assign ready_o = input_stream.ready; 
+
+  hwpe_stream_intf_stream #(
+    .DATA_WIDTH (InputStreamWidth)
+  ) input_stream_demux[1:0] (
+    .clk ( clk_i )
+  );
+
+  hwpe_stream_intf_stream #(
+    .DATA_WIDTH (InputStreamWidth)
+  ) input_stream_demux_fifo[1:0] (
+    .clk ( clk_i )
+  );
+
+  hwpe_stream_intf_stream #(
+    .DATA_WIDTH (InputStreamWidth)
+  ) input_stream_fenced[1:0] (
+    .clk ( clk_i )
+  );
+
 
-  // Output assignments
-  assign output_o.data  = flattened_oup_buffer;
-  assign output_o.valid = valid_i & ping_pong_status_q;
-  assign output_o.strb  = '1;
-  assign ready_o        = enable_i && (((output_o.ready & output_o.valid) && ping_pong_status_q) || ~ping_pong_status_q);
-  // When the input data is not latched that is ping_pong_status_q == 0
-  // Else when one input data is latched(ping_pong_status_q == 1) and there is a handhake then one data could be taken
+  hwpe_stream_intf_stream #(
+    .DATA_WIDTH (OutputStreamWidth)
+  ) input_stream_merged (
+    .clk ( clk_i )
+  );
 
-  // Ping-pong status control
-  assign ping_pong_status_d = clear_i         ? 1'b0 :
-                              input_handshake ? ~ping_pong_status_q :
-                                                ping_pong_status_q;
+  logic sel, sel_d, sel_q; 
 
-  always_ff @(posedge clk_i or negedge rst_ni) begin : ping_pong_status_ff
+  always_ff @(posedge clk_i or negedge rst_ni) begin
     if (~rst_ni) begin
-      ping_pong_status_q <= 1'b0;
-      input_buffer_q     <= '0;
+      sel_q <= 1'b0;
     end else begin
-      ping_pong_status_q <= ping_pong_status_d;
-      input_buffer_q     <= input_buffer_d;
+      sel_q <= sel_d;
     end
   end
 
+  assign sel = sel_q;
+
+  always_comb begin 
+    sel_d = sel_q; 
+    if(clear_i) begin
+      sel_d = 1'b0; 
+    end else begin 
+      sel_d = input_stream.valid & input_stream.ready ? ~sel_q : sel_q;
+    end 
+  end
+
+  hwpe_stream_demux_static #(
+    .NB_OUT_STREAMS(NumStreams)
+  ) i_demux (
+    .clk_i (clk_i),
+    .rst_ni (rst_ni),
+    .clear_i(clear_i),
+    .sel_i (sel),
+    .push_i(input_stream),
+    .pop_o(input_stream_demux)
+  );
+
+  genvar ii; 
+  generate 
+    for(ii=0; ii<NumStreams; ii++) begin : gen_fifo
+      hwpe_stream_fifo #(
+        .DATA_WIDTH(InputStreamWidth),
+        .FIFO_DEPTH(2)
+      ) i_fifo (
+        .clk_i(clk_i),
+        .rst_ni(rst_ni),
+        .clear_i(clear_i),
+        .flags_o(),
+        .push_i(input_stream_demux[ii]),
+        .pop_o(input_stream_demux_fifo[ii])
+      );
+    end : gen_fifo
+  endgenerate
+
+  hwpe_stream_fence #(
+    .NB_STREAMS(NumStreams),
+    .DATA_WIDTH(InputStreamWidth)
+  ) i_fence (
+    .clk_i(clk_i),
+    .rst_ni(rst_ni),
+    .clear_i(clear_i),
+    .test_mode_i(1'b0),
+    .push_i(input_stream_demux_fifo),
+    .pop_o(input_stream_fenced)
+  );
+
+  hwpe_stream_merge #(
+    .NB_IN_STREAMS(NumStreams), 
+    .DATA_WIDTH_IN(InputStreamWidth)
+  ) i_merge (
+    .clk_i(clk_i),
+    .rst_ni(rst_ni),
+    .clear_i(clear_i),
+    .push_i(input_stream_fenced),
+    .pop_o(output_o)
+  );
 endmodule
diff --git a/rtl/pace/pace_xmux.sv b/rtl/pace/pace_xmux.sv
@@ -28,8 +28,9 @@ module pace_xmux
       localparam int GroupSize = 1 << w;
       if (w < PACE_PART_BST_STAGES) begin : gen_along_col_BST_part_stages
         for (genvar h = 0; h < H; h++) begin : gen_along_col
+            localparam rhs_h = h % PACE_NPARTS;
           if (GroupSize == 1) begin : gen_single_group
-            assign pace_mux_output[h][w] = x_input_i[h][w];
+            assign pace_mux_output[h][w] = x_input_i[rhs_h][w];
           end else begin : gen_grouped
             localparam int GroupIndex = (h % (PACE_PART_BST_STAGES+PACE_NPOLY+1)) / GroupSize;
             logic [BITW-1:0] x_flat_array [GroupSize];
diff --git a/rtl/redmule_row.sv b/rtl/redmule_row.sv
diff --git a/sw/redmule.c b/sw/redmule.c