pulp-platform
diff --git a/.gitignore b/.gitignore
Lines changed: 9 additions & 0 deletions
diff --git a/Bender.lock b/Bender.lock
Lines changed: 1 addition & 1 deletion
diff --git a/Bender.yml b/Bender.yml
Lines changed: 1 addition & 1 deletion
diff --git a/golden-model/FP16/scripts/pace.py b/golden-model/FP16/scripts/pace.py
Lines changed: 29 additions & 19 deletions
diff --git a/rtl/pace/pace_pingpong_inp.sv b/rtl/pace/pace_pingpong_inp.sv
Lines changed: 54 additions & 27 deletions
@@ -0,0 +1,9 @@
+**venv
+**.bender
+target/sim/vsim/work/
+sw/build/
+golden-model/**/txt/
+target/sim/vsim/vsim.wlf
+sw/inc/*
+.bender/** */
+target/sim/vsim/transcript**
@@ -86,7 +86,7 @@ packages:
     dependencies:
     - tech_cells_generic
   hwpe-stream:
-    revision: db62a6411a7f3dc2b2a74e202377da118a4a6673
+    revision: 7eb50a7cb37dc2a970e0cfee10aff5d961e41340
     version: null
     source:
       Git: https://github.com/pulp-platform/hwpe-stream.git
 
@@ -15,7 +15,7 @@ dependencies:
   cv32e40p          : { git: "https://github.com/pulp-platform/cv32e40p.git"          , rev: "astral-v1.0"                            }
   cv32e40x          : { git: "https://github.com/pulp-platform/cv32e40x.git"          , rev: "redmule-v1.0"                           }
   ibex              : { git: "https://github.com/pulp-platform/ibex.git"              , rev: pulpissimo-v6.1.2                        }
-  hwpe-stream       : { git: "https://github.com/pulp-platform/hwpe-stream.git"       , rev: db62a6411a7f3dc2b2a74e202377da118a4a6673 } #branch: ab/strb_fix
+  hwpe-stream       : { git: "https://github.com/pulp-platform/hwpe-stream.git"       , rev: 7eb50a7cb37dc2a970e0cfee10aff5d961e41340 } #branch: ab/strb_fix
   hwpe-ctrl         : { git: "https://github.com/pulp-platform/hwpe-ctrl.git"         , rev: 0e95510c0f4d43452d21b7723d766ae92e45c101 } # branch: yt/task-interfaces
   hci               : { git: "https://github.com/pulp-platform/hci.git"               , rev: fa625bdb824209bc2c0faaa6d99ec15ff981473f } # branch: ab/fifo_options
   fpnew             : { git: "https://github.com/pulp-platform/cvfpu.git"             , rev: "pulp-v0.1.3"                            }
 
@@ -6,7 +6,7 @@
 #
 
 import numpy as np
-import os 
+import os
 
 def float16_to_hex(f16_val):
     arr = np.array(f16_val, dtype=np.float16).reshape(())
@@ -15,6 +15,9 @@ def float16_to_hex(f16_val):
 def silu(x):
     return x / (1 + np.exp(-x))
 
+def exp(x):
+    return np.exp(x)
+
 # === BST Partitioning ===
 
 def build_bst_indices(n_partitions):
@@ -61,7 +64,7 @@ def piecewise_poly_approx_bst_fp16(
 
     coeffs = []
     for i in range(partitions):
-        x = np.linspace(raw_bps[i], raw_bps[i + 1], 50).astype(np.float16)
+        x = np.linspace(raw_bps[i], raw_bps[i + 1], 500).astype(np.float16)
         y = func(x.astype(np.float32)).astype(np.float16)
         p = np.polynomial.Polynomial.fit(x.astype(np.float32), y.astype(np.float32), deg=degree,
                                          domain=[float(raw_bps[i]), float(raw_bps[i + 1])])
@@ -73,6 +76,7 @@ def piecewise_poly_approx_bst_fp16(
     y_approx = np.zeros_like(x_vals)
 
     debug_lines = []
+    custom_debug_lines = []
 
     # breakpoint layout
     debug_lines.append("=== Raw Breakpoints (sorted) ===")
@@ -115,23 +119,27 @@ def piecewise_poly_approx_bst_fp16(
         dbg.append(f"  y_approx  = {y_approx[idx]:.5f} ({float16_to_hex(y_approx[idx])})")
         dbg.append(f"  error     = {float(y_true[idx] - y_approx[idx]):.5f}")
         debug_lines.append("\n".join(dbg) + "\n")
+        if idx % 16 == 0:
+            custom_debug_lines.append("\n".join(dbg))
 
     return {
         "x_vals": x_vals,
         "y_true": y_true,
         "y_approx": y_approx,
         "breakpoints_bst": breakpoints_bst,
         "coeffs": coeffs,
-        "debug_lines": debug_lines
+        "debug_lines": debug_lines,
+        "custom_debug_lines": custom_debug_lines
     }
 
 
 def write_debug_output(results, debug_file="execution.txt"):
     with open(debug_file, "w") as f:
-        for line in results["debug_lines"]:
+        for line in results:
             f.write(line + "\n")
     print(f"✅ Debug written to: {debug_file}")
 
+
 def write_coefficients_output(results, coeff_file="coefficients.txt"):
     with open(coeff_file, "w") as f:
         for i, coeffs in enumerate(results["coeffs"]):
@@ -176,18 +184,18 @@ def write_x_file(coeffs, xmin=-6, xmax=6, partitions=8, stimuli_file="x_input.h"
         f_x.write('};\n')
     print(f"✅ x_input header written to: {stimuli_file}")
 
-        
+
 
 def write_inp_inc_file(results, stimuli_file="w_input.h"):
-    size = len(results["x_vals"]) 
+    size = len(results["x_vals"])
     with open(stimuli_file, "w") as f:
         f.write(f' uint16_t w_inp [{size}] =' +'{')
         for i, x in enumerate(results["x_vals"]):
             if i%8==0:
                 f.write('\n')
-            if i == size  - 1: 
+            if i == size  - 1:
                 f.write(f"  {float16_to_hex(x)}\n")
-            else: 
+            else:
                 f.write(f"  {float16_to_hex(x)},")
         f.write('};\n')
     print(f"✅ Stimuli header written to: {stimuli_file}")
@@ -199,8 +207,8 @@ def write_golden_oup_inc_file(results, stimuli_file="golden.h"):
     with open(stimuli_file, "w") as f:
         f.write(f'uint32_t golden[{size}] = {{\n')
         for i in range(0, len(y_approx), 2):
-            low_16 = float16_to_hex(y_approx[i]).removeprefix("0x")
-            high_16 = float16_to_hex(y_approx[i + 1]).removeprefix("0x")
+            low_16 = float16_to_hex(y_approx[i]).replace("0x", "")
+            high_16 = float16_to_hex(y_approx[i + 1]).replace("0x", "")
             combined = f"0x{high_16}{low_16}"
             end_char = ',\n' if i < len(y_approx) - 2 else '\n'
             f.write(f"{combined}{end_char}")
@@ -209,7 +217,7 @@ def write_golden_oup_inc_file(results, stimuli_file="golden.h"):
 
 def write_golden_inc_debug_file(results, stimuli_file="golden_debug.h"):
     y_approx = results["y_approx"]
-    size = len(y_approx) 
+    size = len(y_approx)
 
     with open(stimuli_file, "w") as f:
         f.write(f'uint32_t golden[{size}] = {{')
@@ -245,40 +253,42 @@ def write_tensor_dim_inc_file(stimuli_file = "tensor_dim.h", n_tests=1000):
     f_d.write('#ifndef __TENSOR_DIM__\n'       )
     f_d.write('#define __TENSOR_DIM__\n\n'     )
     f_d.write('#define M_SIZE  8 \n'           )
-    f_d.write('#define N_SIZE  32\n'            )
-    f_d.write(f'#define K_SIZE  {n_tests/8} \n')
+    f_d.write('#define N_SIZE  64\n'            )
+    f_d.write(f'#define K_SIZE  {n_tests/32} \n')
     f_d.write('#define SRC_FMT FP16\n'         )
     f_d.write('#define DST_FMT FP16\n'         )
     f_d.write('#define FPFORMAT 16\n'          )
     f_d.write('uint8_t gemm_ops = PACE; \n'    )
+    f_d.write('uint8_t quant_fmt = 0; \n'      )
     f_d.write('\n#endif\n'                     )
     f_d.close()
 
 
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser("PACE Operation Test")
-    parser.add_argument( '--x_min',     type=int, default=-6 )
-    parser.add_argument( '--x_max',     type=int, default=6 )
+    parser.add_argument( '--x_min',     type=int, default=-11 )
+    parser.add_argument( '--x_max',     type=int, default=0 )
     parser.add_argument( '--f_name',    type=str, default="silu" )
     parser.add_argument( '--n_parts',   type=int, default=8 )
     parser.add_argument( '--n_deg',     type=int, default=4 )
-    parser.add_argument( '--n_tests',   type=int, default=1024 )
+    parser.add_argument( '--n_tests',   type=int, default=4096 )
     parser.add_argument( '--file_name', type=str, default='net_parameters.h')
     parser.add_argument( '--inc_dir',   type=str)
     parser.add_argument( '--txt_dir',   type=str)
     args = parser.parse_args()
     results = piecewise_poly_approx_bst_fp16(
-        silu, xmin=-6, xmax=6, degree=4, partitions=8, n_stimuli=args.n_tests
+        exp, xmin=args.x_min, xmax=args.x_max, degree=4, partitions=8, n_stimuli=args.n_tests
     )
-    write_debug_output(debug_file=os.path.join(args.txt_dir,"execution.txt"),results=results)
+    write_debug_output(debug_file=os.path.join(args.txt_dir,"execution.txt"),results=results["debug_lines"])
+    write_debug_output(debug_file=os.path.join(args.txt_dir,"execution_custom.txt"),results=results["custom_debug_lines"])
     write_coefficients_output(coeff_file=os.path.join(args.txt_dir,"coefficients.txt"), results=results)
     write_inp_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "w_input.h"))
     write_golden_oup_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "golden.h"))
     write_actual_oup_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "z_output.h"))
     write_golden_inc_debug_file(results, stimuli_file=os.path.join(args.txt_dir, "golden_debug.h"))
     write_y_inp_inc_file(stimuli_file=os.path.join(args.inc_dir, "y_input.h"))
-    write_x_file(coeffs=results["coeffs"], xmin=-6, xmax=6, partitions=8, stimuli_file=os.path.join(args.inc_dir, "x_input.h")) 
+    write_x_file(coeffs=results["coeffs"], xmin=args.x_min, xmax=args.x_max, partitions=8, stimuli_file=os.path.join(args.inc_dir, "x_input.h"))
     write_tensor_dim_inc_file(stimuli_file=os.path.join(args.inc_dir, "tensor_dim.h"), n_tests=args.n_tests)
 
 
@@ -6,29 +6,40 @@
 //
 // This module takes a 256b data and splits it to 128b to feed it to the engine for 2 cycles.
 module pace_pingpong_inp #(
-  parameter int unsigned InpDataWidth  = 256,
-  parameter int unsigned NumRows       = 8,
-  parameter int unsigned OupDataWidth  = 16
+  parameter int unsigned InpDataWidth    = 256,
+  parameter int unsigned NumRows         = 8,
+  parameter int unsigned CEOupDataWidth  = 16,
+  localparam int unsigned OupDataWidth   = NumRows * CEOupDataWidth
 ) (
   input  logic                                    clk_i,
   input  logic                                    rst_ni,
   input  logic                                    clear_i,
   input  logic                                    enable_i,
-  output logic [NumRows-1:0][OupDataWidth-1:0]    output_o,
+  output logic [NumRows-1:0][CEOupDataWidth-1:0]  output_o,
   output logic                                    valid_o,
   input  logic                                    ready_i,
   hwpe_stream_intf_stream.sink                    input_i
 );
 
   // Local signals
   hwpe_stream_intf_stream #(
-    .DATA_WIDTH ( InpDataWidth*NumRows )
+    .DATA_WIDTH (InpDataWidth/2 )
   ) ping_pong_buffer [1:0] (
     .clk ( clk_i )
   );
-  logic                            output_handshake;
-  logic [NumRows*OupDataWidth-1:0] output_buffer;
-  logic                            ping_pong_status_d, ping_pong_status_q;
+
+  hwpe_stream_intf_stream #(
+    .DATA_WIDTH ( InpDataWidth/2 )
+  ) output_buffer (
+    .clk ( clk_i )
+  );
+
+  hwpe_stream_intf_stream #(
+    .DATA_WIDTH ( InpDataWidth/2 )
+  ) output_buffer_fifo (
+    .clk ( clk_i )
+  );
+
 
   // Stream splitter
   hwpe_stream_split #(
@@ -42,33 +53,49 @@ module pace_pingpong_inp #(
     .pop_o   ( ping_pong_buffer )
   );
 
-  // Ready/valid handshake
-  assign ping_pong_buffer[0].ready = output_handshake & ping_pong_status_q & enable_i;
-  assign ping_pong_buffer[1].ready = output_handshake & ping_pong_status_q & enable_i;
+  hwpe_stream_package::ctrl_serdes_t ctrl_serdes;
 
-  assign output_buffer = ping_pong_status_q ? ping_pong_buffer[1].data  : ping_pong_buffer[0].data;
-  assign valid_o       = ping_pong_status_q ? ping_pong_buffer[1].valid : ping_pong_buffer[0].valid;
+  assign ctrl_serdes.clear_serdes_state = clear_i;
+  assign ctrl_serdes.nb_contig_m1       = 0;
+  assign ctrl_serdes.first_stream       = 1'b0;
+
+  hwpe_stream_serialize #(
+    .NB_IN_STREAMS ( 2            ),
+    .CONTIG_LIMIT  ( 1024         ),
+    .DATA_WIDTH    ( OupDataWidth ),
+    .SYNC_READY    ( 1'b1         )
+  ) i_hwpe_stream_serialize (
+    .clk_i   ( clk_i            ),
+    .rst_ni  ( rst_ni           ),
+    .clear_i ( clear_i          ),
+    .ctrl_i  ( ctrl_serdes      ),
+    .push_i  ( ping_pong_buffer ),
+    .pop_o   ( output_buffer    )
+  );
+
+  hwpe_stream_fifo #(
+    .DATA_WIDTH ( OupDataWidth ),
+    .FIFO_DEPTH ( 2            ),
+    .LATCH_FIFO ( 0            ),
+    .LATCH_FIFO_TEST_WRAP ( 0  )
+  ) i_hwpe_stream_fifo (
+    .clk_i   ( clk_i              ),
+    .rst_ni  ( rst_ni             ),
+    .clear_i ( clear_i            ),
+    .flags_o (                    ),
+    .push_i  ( output_buffer      ),
+    .pop_o   ( output_buffer_fifo )
+  );
 
   // Output slicing
   generate
     for (genvar r = 0; r < NumRows; r++) begin : gen_output_unpack
-      assign output_o[r] = output_buffer[(OupDataWidth*(r+1))-1 -: OupDataWidth];
+      assign output_o[r] = output_buffer_fifo.data[(CEOupDataWidth*(r+1))-1 -: CEOupDataWidth];
     end
   endgenerate
+  assign valid_o = output_buffer_fifo.valid;
+  assign output_buffer_fifo.ready = ready_i & enable_i;
 
-  // Handshake logic
-  assign output_handshake = valid_o & ready_i;
 
-  // Ping-pong control
-  assign ping_pong_status_d = clear_i             ? 1'b0 :
-                              output_handshake    ? ~ping_pong_status_q :
-                                                    ping_pong_status_q;
-  always_ff @(posedge clk_i or negedge rst_ni) begin : gen_ping_pong_status_ff
-    if (~rst_ni) begin
-      ping_pong_status_q <= 1'b0;
-    end else begin
-      ping_pong_status_q <= ping_pong_status_d;
-    end
-  end
 
 endmodule