Skip to content

Commit 22f3013

Browse files
committed
[pace-fix] refactor ping pong input and output buffer using existing IPs.
The interface is verified with stallability from the memory side. The pace.py test stimuli generation is also updated to use the exp function.
1 parent a72eeca commit 22f3013

File tree

7 files changed

+199
-80
lines changed

7 files changed

+199
-80
lines changed

.gitignore

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
**venv
2+
**.bender
3+
target/sim/vsim/work/
4+
sw/build/
5+
golden-model/**/txt/
6+
target/sim/vsim/vsim.wlf
7+
sw/inc/*
8+
.bender/** */
9+
target/sim/vsim/transcript**

golden-model/FP16/scripts/pace.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ def float16_to_hex(f16_val):
1515
def silu(x):
    """Return the SiLU activation of ``x``: ``x / (1 + exp(-x))``.

    The exponential is evaluated with NumPy, so ``x`` may be a scalar or
    an array; the result is computed element-wise.
    """
    denominator = 1 + np.exp(-x)
    return x / denominator
1717

18+
def exp(x):
    """Return ``e**x`` evaluated element-wise by NumPy.

    Thin wrapper around ``np.exp`` so the exponential can be passed around
    as a plain function, in the same way as ``silu``.
    """
    result = np.exp(x)
    return result
20+
1821
# === BST Partitioning ===
1922

2023
def build_bst_indices(n_partitions):
@@ -61,7 +64,7 @@ def piecewise_poly_approx_bst_fp16(
6164

6265
coeffs = []
6366
for i in range(partitions):
64-
x = np.linspace(raw_bps[i], raw_bps[i + 1], 50).astype(np.float16)
67+
x = np.linspace(raw_bps[i], raw_bps[i + 1], 500).astype(np.float16)
6568
y = func(x.astype(np.float32)).astype(np.float16)
6669
p = np.polynomial.Polynomial.fit(x.astype(np.float32), y.astype(np.float32), deg=degree,
6770
domain=[float(raw_bps[i]), float(raw_bps[i + 1])])
@@ -73,6 +76,7 @@ def piecewise_poly_approx_bst_fp16(
7376
y_approx = np.zeros_like(x_vals)
7477

7578
debug_lines = []
79+
custom_debug_lines = []
7680

7781
# breakpoint layout
7882
debug_lines.append("=== Raw Breakpoints (sorted) ===")
@@ -115,23 +119,27 @@ def piecewise_poly_approx_bst_fp16(
115119
dbg.append(f" y_approx = {y_approx[idx]:.5f} ({float16_to_hex(y_approx[idx])})")
116120
dbg.append(f" error = {float(y_true[idx] - y_approx[idx]):.5f}")
117121
debug_lines.append("\n".join(dbg) + "\n")
122+
if idx % 16 == 0:
123+
custom_debug_lines.append("\n".join(dbg))
118124

119125
return {
120126
"x_vals": x_vals,
121127
"y_true": y_true,
122128
"y_approx": y_approx,
123129
"breakpoints_bst": breakpoints_bst,
124130
"coeffs": coeffs,
125-
"debug_lines": debug_lines
131+
"debug_lines": debug_lines,
132+
"custom_debug_lines": custom_debug_lines
126133
}
127134

128135

129136
def write_debug_output(results, debug_file="execution.txt"):
    """Write debug lines to ``debug_file``, one entry per line.

    Args:
        results: Either an iterable of strings to write, or the result
            dictionary of the approximation routine, in which case its
            ``"debug_lines"`` entry is written. Accepting the dictionary
            keeps older call sites working; iterating the dictionary
            directly would silently write its keys instead of the log.
        debug_file: Path of the text file to (over)write.

    Raises:
        KeyError: If ``results`` is a dictionary without ``"debug_lines"``.
    """
    # Backward compatibility: older callers pass the whole result dict.
    lines = results["debug_lines"] if isinstance(results, dict) else results
    with open(debug_file, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)
    print(f"✅ Debug written to: {debug_file}")
134141

142+
135143
def write_coefficients_output(results, coeff_file="coefficients.txt"):
136144
with open(coeff_file, "w") as f:
137145
for i, coeffs in enumerate(results["coeffs"]):
@@ -245,8 +253,8 @@ def write_tensor_dim_inc_file(stimuli_file = "tensor_dim.h", n_tests=1000):
245253
f_d.write('#ifndef __TENSOR_DIM__\n' )
246254
f_d.write('#define __TENSOR_DIM__\n\n' )
247255
f_d.write('#define M_SIZE 8 \n' )
248-
f_d.write('#define N_SIZE 32\n' )
249-
f_d.write(f'#define K_SIZE {n_tests/8} \n')
256+
f_d.write('#define N_SIZE 64\n' )
257+
f_d.write(f'#define K_SIZE {n_tests/32} \n')
250258
f_d.write('#define SRC_FMT FP16\n' )
251259
f_d.write('#define DST_FMT FP16\n' )
252260
f_d.write('#define FPFORMAT 16\n' )
@@ -258,27 +266,28 @@ def write_tensor_dim_inc_file(stimuli_file = "tensor_dim.h", n_tests=1000):
258266
if __name__ == "__main__":
    # Script entry point: fit the piecewise polynomial approximation of the
    # selected function and emit the debug text files and C include files.
    import argparse

    # Functions selectable through --f_name.
    target_functions = {"silu": silu, "exp": exp}

    parser = argparse.ArgumentParser("PACE Operation Test")
    parser.add_argument('--x_min', type=int, default=-11)
    parser.add_argument('--x_max', type=int, default=0)
    # The default is "exp" because that is the function this script was
    # hard-wired to before --f_name was honoured, so a default run produces
    # the same stimuli as before.
    parser.add_argument('--f_name', type=str, default="exp",
                        choices=sorted(target_functions))
    parser.add_argument('--n_parts', type=int, default=8)
    parser.add_argument('--n_deg', type=int, default=4)
    parser.add_argument('--n_tests', type=int, default=4096)
    parser.add_argument('--file_name', type=str, default='net_parameters.h')
    # Both directories are joined into every output path below, so a missing
    # value must be reported by argparse instead of failing in os.path.join.
    parser.add_argument('--inc_dir', type=str, required=True)
    parser.add_argument('--txt_dir', type=str, required=True)
    args = parser.parse_args()

    results = piecewise_poly_approx_bst_fp16(
        target_functions[args.f_name],
        xmin=args.x_min,
        xmax=args.x_max,
        degree=args.n_deg,
        partitions=args.n_parts,
        n_stimuli=args.n_tests,
    )

    # Full per-stimulus log plus the reduced log kept in "custom_debug_lines".
    write_debug_output(debug_file=os.path.join(args.txt_dir, "execution.txt"),
                       results=results["debug_lines"])
    write_debug_output(debug_file=os.path.join(args.txt_dir, "execution_custom.txt"),
                       results=results["custom_debug_lines"])
    write_coefficients_output(coeff_file=os.path.join(args.txt_dir, "coefficients.txt"),
                              results=results)

    write_inp_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "w_input.h"))
    write_golden_oup_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "golden.h"))
    write_actual_oup_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "z_output.h"))
    write_golden_inc_debug_file(results, stimuli_file=os.path.join(args.txt_dir, "golden_debug.h"))
    write_y_inp_inc_file(stimuli_file=os.path.join(args.inc_dir, "y_input.h"))
    write_x_file(coeffs=results["coeffs"], xmin=args.x_min, xmax=args.x_max,
                 partitions=args.n_parts,
                 stimuli_file=os.path.join(args.inc_dir, "x_input.h"))
    write_tensor_dim_inc_file(stimuli_file=os.path.join(args.inc_dir, "tensor_dim.h"),
                              n_tests=args.n_tests)
283292

284293

rtl/pace/pace_pingpong_inp.sv

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,23 @@ module pace_pingpong_inp #(
2222

2323
// Local signals
2424
hwpe_stream_intf_stream #(
25-
.DATA_WIDTH ( InpDataWidth*NumRows )
25+
.DATA_WIDTH (InpDataWidth/2 )
2626
) ping_pong_buffer [1:0] (
2727
.clk ( clk_i )
2828
);
29-
logic output_handshake;
30-
logic [NumRows*OupDataWidth-1:0] output_buffer;
31-
logic ping_pong_status_d, ping_pong_status_q;
29+
30+
hwpe_stream_intf_stream #(
31+
.DATA_WIDTH ( InpDataWidth/2 )
32+
) output_buffer (
33+
.clk ( clk_i )
34+
);
35+
36+
hwpe_stream_intf_stream #(
37+
.DATA_WIDTH ( InpDataWidth/2 )
38+
) output_buffer_fifo (
39+
.clk ( clk_i )
40+
);
41+
3242

3343
// Stream splitter
3444
hwpe_stream_split #(
@@ -42,33 +52,49 @@ module pace_pingpong_inp #(
4252
.pop_o ( ping_pong_buffer )
4353
);
4454

45-
// Ready/valid handshake
46-
assign ping_pong_buffer[0].ready = output_handshake & ping_pong_status_q & enable_i;
47-
assign ping_pong_buffer[1].ready = output_handshake & ping_pong_status_q & enable_i;
55+
hwpe_stream_package::ctrl_serdes_t ctrl_serdes;
56+
57+
assign ctrl_serdes.clear_serdes_state = clear_i;
58+
assign ctrl_serdes.nb_contig_m1 = 0;
59+
assign ctrl_serdes.first_stream = 1'b0;
4860

49-
assign output_buffer = ping_pong_status_q ? ping_pong_buffer[1].data : ping_pong_buffer[0].data;
50-
assign valid_o = ping_pong_status_q ? ping_pong_buffer[1].valid : ping_pong_buffer[0].valid;
61+
hwpe_stream_serialize #(
62+
.NB_IN_STREAMS ( 2 ),
63+
.CONTIG_LIMIT ( 1024 ),
64+
.DATA_WIDTH ( NumRows*OupDataWidth ),
65+
.SYNC_READY ( 1'b1 )
66+
) i_hwpe_stream_serialize (
67+
.clk_i ( clk_i ),
68+
.rst_ni ( rst_ni ),
69+
.clear_i ( clear_i ),
70+
.ctrl_i ( ctrl_serdes ),
71+
.push_i ( ping_pong_buffer ),
72+
.pop_o ( output_buffer )
73+
);
74+
75+
hwpe_stream_fifo #(
76+
.DATA_WIDTH ( NumRows*OupDataWidth ),
77+
.FIFO_DEPTH ( 2 ),
78+
.LATCH_FIFO ( 0 ),
79+
.LATCH_FIFO_TEST_WRAP ( 0 )
80+
) i_hwpe_stream_fifo (
81+
.clk_i ( clk_i ),
82+
.rst_ni ( rst_ni ),
83+
.clear_i ( clear_i ),
84+
.flags_o ( ),
85+
.push_i ( output_buffer ),
86+
.pop_o ( output_buffer_fifo )
87+
);
5188

5289
// Output slicing
5390
generate
5491
for (genvar r = 0; r < NumRows; r++) begin : gen_output_unpack
55-
assign output_o[r] = output_buffer[(OupDataWidth*(r+1))-1 -: OupDataWidth];
92+
assign output_o[r] = output_buffer_fifo.data[(OupDataWidth*(r+1))-1 -: OupDataWidth];
5693
end
5794
endgenerate
95+
assign valid_o = output_buffer_fifo.valid;
96+
assign output_buffer_fifo.ready = ready_i & enable_i;
5897

59-
// Handshake logic
60-
assign output_handshake = valid_o & ready_i;
6198

62-
// Ping-pong control
63-
assign ping_pong_status_d = clear_i ? 1'b0 :
64-
output_handshake ? ~ping_pong_status_q :
65-
ping_pong_status_q;
66-
always_ff @(posedge clk_i or negedge rst_ni) begin : gen_ping_pong_status_ff
67-
if (~rst_ni) begin
68-
ping_pong_status_q <= 1'b0;
69-
end else begin
70-
ping_pong_status_q <= ping_pong_status_d;
71-
end
72-
end
7399

74100
endmodule

rtl/pace/pace_pingpong_oup.sv

Lines changed: 103 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010

1111
module pace_pingpong_oup #(
1212
parameter int unsigned NumRows = 8,
13-
parameter int unsigned InpDataWidth = 16
13+
parameter int unsigned InpDataWidth = 16,
14+
localparam int unsigned InputStreamWidth = NumRows*InpDataWidth,
15+
localparam int unsigned NumStreams = 2,
16+
localparam int unsigned OutputStreamWidth = NumStreams*InputStreamWidth
1417
) (
1518
input logic clk_i,
1619
input logic rst_ni,
@@ -22,51 +25,111 @@ module pace_pingpong_oup #(
2225
hwpe_stream_intf_stream.source output_o
2326
);
2427

25-
// Internal signals
26-
logic [NumRows*InpDataWidth-1:0] input_buffer_d, input_buffer_q;
27-
logic [2*NumRows*InpDataWidth-1:0] flattened_oup_buffer;
28-
logic ping_pong_status_d, ping_pong_status_q;
29-
logic input_handshake;
30-
31-
// Handshake
32-
assign input_handshake = valid_i & ready_o;
33-
34-
// Input buffering logic
35-
assign input_buffer_d = clear_i ? '0 :
36-
(input_handshake && ~ping_pong_status_q) ? input_i : input_buffer_q;
37-
38-
// Flatten the buffered output
39-
generate
40-
for (genvar r = 0; r < 2*NumRows; r++) begin : gen_flattened_output
41-
if (r < NumRows) begin : gen_even_entry
42-
assign flattened_oup_buffer[(r+1)*InpDataWidth-1 -: InpDataWidth] = input_buffer_q[(r+1)*InpDataWidth-1 -: InpDataWidth];
43-
end else begin : gen_odd_entry
44-
assign flattened_oup_buffer[(r+1)*InpDataWidth-1 -: InpDataWidth] = input_i[r-NumRows];
45-
end
46-
end
47-
endgenerate
28+
hwpe_stream_intf_stream #(
29+
.DATA_WIDTH (InputStreamWidth)
30+
) input_stream (
31+
.clk ( clk_i )
32+
);
33+
34+
assign input_stream.data = input_i;
35+
assign input_stream.valid = valid_i;
36+
assign input_stream.strb = '1;
37+
assign ready_o = input_stream.ready;
38+
39+
hwpe_stream_intf_stream #(
40+
.DATA_WIDTH (InputStreamWidth)
41+
) input_stream_demux[1:0] (
42+
.clk ( clk_i )
43+
);
44+
45+
hwpe_stream_intf_stream #(
46+
.DATA_WIDTH (InputStreamWidth)
47+
) input_stream_demux_fifo[1:0] (
48+
.clk ( clk_i )
49+
);
50+
51+
hwpe_stream_intf_stream #(
52+
.DATA_WIDTH (InputStreamWidth)
53+
) input_stream_fenced[1:0] (
54+
.clk ( clk_i )
55+
);
56+
4857

49-
// Output assignments
50-
assign output_o.data = flattened_oup_buffer;
51-
assign output_o.valid = valid_i & ping_pong_status_q;
52-
assign output_o.strb = '1;
53-
assign ready_o = enable_i && (((output_o.ready & output_o.valid) && ping_pong_status_q) || ~ping_pong_status_q);
54-
// When the input data is not latched that is ping_pong_status_q == 0
55-
// Else when one input data is latched(ping_pong_status_q == 1) and there is a handhake then one data could be taken
58+
hwpe_stream_intf_stream #(
59+
.DATA_WIDTH (OutputStreamWidth)
60+
) input_stream_merged (
61+
.clk ( clk_i )
62+
);
5663

57-
// Ping-pong status control
58-
assign ping_pong_status_d = clear_i ? 1'b0 :
59-
input_handshake ? ~ping_pong_status_q :
60-
ping_pong_status_q;
64+
logic sel, sel_d, sel_q;
6165

62-
always_ff @(posedge clk_i or negedge rst_ni) begin : ping_pong_status_ff
66+
always_ff @(posedge clk_i or negedge rst_ni) begin
6367
if (~rst_ni) begin
64-
ping_pong_status_q <= 1'b0;
65-
input_buffer_q <= '0;
68+
sel_q <= 1'b0;
6669
end else begin
67-
ping_pong_status_q <= ping_pong_status_d;
68-
input_buffer_q <= input_buffer_d;
70+
sel_q <= sel_d;
6971
end
7072
end
7173

74+
assign sel = sel_q;
75+
76+
always_comb begin
77+
sel_d = sel_q;
78+
if(clear_i) begin
79+
sel_d = 1'b0;
80+
end else begin
81+
sel_d = input_stream.valid & input_stream.ready ? ~sel_q : sel_q;
82+
end
83+
end
84+
85+
hwpe_stream_demux_static #(
86+
.NB_OUT_STREAMS(NumStreams)
87+
) i_demux (
88+
.clk_i (clk_i),
89+
.rst_ni (rst_ni),
90+
.clear_i(clear_i),
91+
.sel_i (sel),
92+
.push_i(input_stream),
93+
.pop_o(input_stream_demux)
94+
);
95+
96+
genvar ii;
97+
generate
98+
for(ii=0; ii<NumStreams; ii++) begin : gen_fifo
99+
hwpe_stream_fifo #(
100+
.DATA_WIDTH(InputStreamWidth),
101+
.FIFO_DEPTH(2)
102+
) i_fifo (
103+
.clk_i(clk_i),
104+
.rst_ni(rst_ni),
105+
.clear_i(clear_i),
106+
.flags_o(),
107+
.push_i(input_stream_demux[ii]),
108+
.pop_o(input_stream_demux_fifo[ii])
109+
);
110+
end : gen_fifo
111+
endgenerate
112+
113+
hwpe_stream_fence #(
114+
.NB_STREAMS(NumStreams),
115+
.DATA_WIDTH(InputStreamWidth)
116+
) i_fence (
117+
.clk_i(clk_i),
118+
.rst_ni(rst_ni),
119+
.clear_i(clear_i),
120+
.test_mode_i(1'b0),
121+
.push_i(input_stream_demux_fifo),
122+
.pop_o(input_stream_fenced)
123+
);
124+
125+
hwpe_stream_merge #(
126+
.NB_IN_STREAMS(NumStreams),
127+
.DATA_WIDTH_IN(InputStreamWidth)
128+
) i_merge (
129+
.clk_i(clk_i),
130+
.rst_ni(rst_ni),
131+
.clear_i(clear_i),
132+
.push_i(input_stream_fenced),
133+
.pop_o(output_o)
134+
);
72135
endmodule

rtl/pace/pace_xmux.sv

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ module pace_xmux
2828
localparam int GroupSize = 1 << w;
2929
if (w < PACE_PART_BST_STAGES) begin : gen_along_col_BST_part_stages
3030
for (genvar h = 0; h < H; h++) begin : gen_along_col
31+
localparam rhs_h = h % PACE_NPARTS;
3132
if (GroupSize == 1) begin : gen_single_group
32-
assign pace_mux_output[h][w] = x_input_i[h][w];
33+
assign pace_mux_output[h][w] = x_input_i[rhs_h][w];
3334
end else begin : gen_grouped
3435
localparam int GroupIndex = (h % (PACE_PART_BST_STAGES+PACE_NPOLY+1)) / GroupSize;
3536
logic [BITW-1:0] x_flat_array [GroupSize];

0 commit comments

Comments
 (0)