Skip to content

Commit 8639ec7

Browse files
authored
Merge pull request #52 from pulp-platform/prasadar/picobello
[pace-fix] many fixes related to interface issues
2 parents 8556300 + 1a364b6 commit 8639ec7

File tree

12 files changed

+219
-98
lines changed

12 files changed

+219
-98
lines changed

.gitignore

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
**venv
2+
**.bender
3+
target/sim/vsim/work/
4+
sw/build/
5+
golden-model/**/txt/
6+
target/sim/vsim/vsim.wlf
7+
sw/inc/*
8+
.bender/** */
9+
target/sim/vsim/transcript**

Bender.lock

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ packages:
8686
dependencies:
8787
- tech_cells_generic
8888
hwpe-stream:
89-
revision: db62a6411a7f3dc2b2a74e202377da118a4a6673
89+
revision: 7eb50a7cb37dc2a970e0cfee10aff5d961e41340
9090
version: null
9191
source:
9292
Git: https://github.com/pulp-platform/hwpe-stream.git

Bender.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ dependencies:
1515
cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: "astral-v1.0" }
1616
cv32e40x : { git: "https://github.com/pulp-platform/cv32e40x.git" , rev: "redmule-v1.0" }
1717
ibex : { git: "https://github.com/pulp-platform/ibex.git" , rev: pulpissimo-v6.1.2 }
18-
hwpe-stream : { git: "https://github.com/pulp-platform/hwpe-stream.git" , rev: db62a6411a7f3dc2b2a74e202377da118a4a6673 } #branch: ab/strb_fix
18+
hwpe-stream : { git: "https://github.com/pulp-platform/hwpe-stream.git" , rev: 7eb50a7cb37dc2a970e0cfee10aff5d961e41340 } #branch: ab/strb_fix
1919
hwpe-ctrl : { git: "https://github.com/pulp-platform/hwpe-ctrl.git" , rev: 0e95510c0f4d43452d21b7723d766ae92e45c101 } # branch: yt/task-interfaces
2020
hci : { git: "https://github.com/pulp-platform/hci.git" , rev: fa625bdb824209bc2c0faaa6d99ec15ff981473f } # branch: ab/fifo_options
2121
fpnew : { git: "https://github.com/pulp-platform/cvfpu.git" , rev: "pulp-v0.1.3" }

golden-model/FP16/scripts/pace.py

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#
77

88
import numpy as np
9-
import os
9+
import os
1010

1111
def float16_to_hex(f16_val):
1212
arr = np.array(f16_val, dtype=np.float16).reshape(())
@@ -15,6 +15,9 @@ def float16_to_hex(f16_val):
1515
def silu(x):
1616
return x / (1 + np.exp(-x))
1717

18+
def exp(x):
19+
return np.exp(x)
20+
1821
# === BST Partitioning ===
1922

2023
def build_bst_indices(n_partitions):
@@ -61,7 +64,7 @@ def piecewise_poly_approx_bst_fp16(
6164

6265
coeffs = []
6366
for i in range(partitions):
64-
x = np.linspace(raw_bps[i], raw_bps[i + 1], 50).astype(np.float16)
67+
x = np.linspace(raw_bps[i], raw_bps[i + 1], 500).astype(np.float16)
6568
y = func(x.astype(np.float32)).astype(np.float16)
6669
p = np.polynomial.Polynomial.fit(x.astype(np.float32), y.astype(np.float32), deg=degree,
6770
domain=[float(raw_bps[i]), float(raw_bps[i + 1])])
@@ -73,6 +76,7 @@ def piecewise_poly_approx_bst_fp16(
7376
y_approx = np.zeros_like(x_vals)
7477

7578
debug_lines = []
79+
custom_debug_lines = []
7680

7781
# breakpoint layout
7882
debug_lines.append("=== Raw Breakpoints (sorted) ===")
@@ -115,23 +119,27 @@ def piecewise_poly_approx_bst_fp16(
115119
dbg.append(f" y_approx = {y_approx[idx]:.5f} ({float16_to_hex(y_approx[idx])})")
116120
dbg.append(f" error = {float(y_true[idx] - y_approx[idx]):.5f}")
117121
debug_lines.append("\n".join(dbg) + "\n")
122+
if idx % 16 == 0:
123+
custom_debug_lines.append("\n".join(dbg))
118124

119125
return {
120126
"x_vals": x_vals,
121127
"y_true": y_true,
122128
"y_approx": y_approx,
123129
"breakpoints_bst": breakpoints_bst,
124130
"coeffs": coeffs,
125-
"debug_lines": debug_lines
131+
"debug_lines": debug_lines,
132+
"custom_debug_lines": custom_debug_lines
126133
}
127134

128135

129136
def write_debug_output(results, debug_file="execution.txt"):
130137
with open(debug_file, "w") as f:
131-
for line in results["debug_lines"]:
138+
for line in results:
132139
f.write(line + "\n")
133140
print(f"✅ Debug written to: {debug_file}")
134141

142+
135143
def write_coefficients_output(results, coeff_file="coefficients.txt"):
136144
with open(coeff_file, "w") as f:
137145
for i, coeffs in enumerate(results["coeffs"]):
@@ -176,18 +184,18 @@ def write_x_file(coeffs, xmin=-6, xmax=6, partitions=8, stimuli_file="x_input.h"
176184
f_x.write('};\n')
177185
print(f"✅ x_input header written to: {stimuli_file}")
178186

179-
187+
180188

181189
def write_inp_inc_file(results, stimuli_file="w_input.h"):
182-
size = len(results["x_vals"])
190+
size = len(results["x_vals"])
183191
with open(stimuli_file, "w") as f:
184192
f.write(f' uint16_t w_inp [{size}] =' +'{')
185193
for i, x in enumerate(results["x_vals"]):
186194
if i%8==0:
187195
f.write('\n')
188-
if i == size - 1:
196+
if i == size - 1:
189197
f.write(f" {float16_to_hex(x)}\n")
190-
else:
198+
else:
191199
f.write(f" {float16_to_hex(x)},")
192200
f.write('};\n')
193201
print(f"✅ Stimuli header written to: {stimuli_file}")
@@ -199,8 +207,8 @@ def write_golden_oup_inc_file(results, stimuli_file="golden.h"):
199207
with open(stimuli_file, "w") as f:
200208
f.write(f'uint32_t golden[{size}] = {{\n')
201209
for i in range(0, len(y_approx), 2):
202-
low_16 = float16_to_hex(y_approx[i]).removeprefix("0x")
203-
high_16 = float16_to_hex(y_approx[i + 1]).removeprefix("0x")
210+
low_16 = float16_to_hex(y_approx[i]).replace("0x", "")
211+
high_16 = float16_to_hex(y_approx[i + 1]).replace("0x", "")
204212
combined = f"0x{high_16}{low_16}"
205213
end_char = ',\n' if i < len(y_approx) - 2 else '\n'
206214
f.write(f"{combined}{end_char}")
@@ -209,7 +217,7 @@ def write_golden_oup_inc_file(results, stimuli_file="golden.h"):
209217

210218
def write_golden_inc_debug_file(results, stimuli_file="golden_debug.h"):
211219
y_approx = results["y_approx"]
212-
size = len(y_approx)
220+
size = len(y_approx)
213221

214222
with open(stimuli_file, "w") as f:
215223
f.write(f'uint32_t golden[{size}] = {{')
@@ -245,40 +253,42 @@ def write_tensor_dim_inc_file(stimuli_file = "tensor_dim.h", n_tests=1000):
245253
f_d.write('#ifndef __TENSOR_DIM__\n' )
246254
f_d.write('#define __TENSOR_DIM__\n\n' )
247255
f_d.write('#define M_SIZE 8 \n' )
248-
f_d.write('#define N_SIZE 32\n' )
249-
f_d.write(f'#define K_SIZE {n_tests/8} \n')
256+
f_d.write('#define N_SIZE 64\n' )
257+
f_d.write(f'#define K_SIZE {n_tests/32} \n')
250258
f_d.write('#define SRC_FMT FP16\n' )
251259
f_d.write('#define DST_FMT FP16\n' )
252260
f_d.write('#define FPFORMAT 16\n' )
253261
f_d.write('uint8_t gemm_ops = PACE; \n' )
262+
f_d.write('uint8_t quant_fmt = 0; \n' )
254263
f_d.write('\n#endif\n' )
255264
f_d.close()
256265

257266

258267
if __name__ == "__main__":
259268
import argparse
260269
parser = argparse.ArgumentParser("PACE Operation Test")
261-
parser.add_argument( '--x_min', type=int, default=-6 )
262-
parser.add_argument( '--x_max', type=int, default=6 )
270+
parser.add_argument( '--x_min', type=int, default=-11 )
271+
parser.add_argument( '--x_max', type=int, default=0 )
263272
parser.add_argument( '--f_name', type=str, default="silu" )
264273
parser.add_argument( '--n_parts', type=int, default=8 )
265274
parser.add_argument( '--n_deg', type=int, default=4 )
266-
parser.add_argument( '--n_tests', type=int, default=1024 )
275+
parser.add_argument( '--n_tests', type=int, default=4096 )
267276
parser.add_argument( '--file_name', type=str, default='net_parameters.h')
268277
parser.add_argument( '--inc_dir', type=str)
269278
parser.add_argument( '--txt_dir', type=str)
270279
args = parser.parse_args()
271280
results = piecewise_poly_approx_bst_fp16(
272-
silu, xmin=-6, xmax=6, degree=4, partitions=8, n_stimuli=args.n_tests
281+
exp, xmin=args.x_min, xmax=args.x_max, degree=4, partitions=8, n_stimuli=args.n_tests
273282
)
274-
write_debug_output(debug_file=os.path.join(args.txt_dir,"execution.txt"),results=results)
283+
write_debug_output(debug_file=os.path.join(args.txt_dir,"execution.txt"),results=results["debug_lines"])
284+
write_debug_output(debug_file=os.path.join(args.txt_dir,"execution_custom.txt"),results=results["custom_debug_lines"])
275285
write_coefficients_output(coeff_file=os.path.join(args.txt_dir,"coefficients.txt"), results=results)
276286
write_inp_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "w_input.h"))
277287
write_golden_oup_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "golden.h"))
278288
write_actual_oup_inc_file(results, stimuli_file=os.path.join(args.inc_dir, "z_output.h"))
279289
write_golden_inc_debug_file(results, stimuli_file=os.path.join(args.txt_dir, "golden_debug.h"))
280290
write_y_inp_inc_file(stimuli_file=os.path.join(args.inc_dir, "y_input.h"))
281-
write_x_file(coeffs=results["coeffs"], xmin=-6, xmax=6, partitions=8, stimuli_file=os.path.join(args.inc_dir, "x_input.h"))
291+
write_x_file(coeffs=results["coeffs"], xmin=args.x_min, xmax=args.x_max, partitions=8, stimuli_file=os.path.join(args.inc_dir, "x_input.h"))
282292
write_tensor_dim_inc_file(stimuli_file=os.path.join(args.inc_dir, "tensor_dim.h"), n_tests=args.n_tests)
283293

284294

rtl/pace/pace_pingpong_inp.sv

Lines changed: 54 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,40 @@
66
//
77
// This module takes 256b of input data and splits it into two 128b halves, which are fed to the engine over 2 cycles.
88
module pace_pingpong_inp #(
9-
parameter int unsigned InpDataWidth = 256,
10-
parameter int unsigned NumRows = 8,
11-
parameter int unsigned OupDataWidth = 16
9+
parameter int unsigned InpDataWidth = 256,
10+
parameter int unsigned NumRows = 8,
11+
parameter int unsigned CEOupDataWidth = 16,
12+
localparam int unsigned OupDataWidth = NumRows * CEOupDataWidth
1213
) (
1314
input logic clk_i,
1415
input logic rst_ni,
1516
input logic clear_i,
1617
input logic enable_i,
17-
output logic [NumRows-1:0][OupDataWidth-1:0] output_o,
18+
output logic [NumRows-1:0][CEOupDataWidth-1:0] output_o,
1819
output logic valid_o,
1920
input logic ready_i,
2021
hwpe_stream_intf_stream.sink input_i
2122
);
2223

2324
// Local signals
2425
hwpe_stream_intf_stream #(
25-
.DATA_WIDTH ( InpDataWidth*NumRows )
26+
.DATA_WIDTH (InpDataWidth/2 )
2627
) ping_pong_buffer [1:0] (
2728
.clk ( clk_i )
2829
);
29-
logic output_handshake;
30-
logic [NumRows*OupDataWidth-1:0] output_buffer;
31-
logic ping_pong_status_d, ping_pong_status_q;
30+
31+
hwpe_stream_intf_stream #(
32+
.DATA_WIDTH ( InpDataWidth/2 )
33+
) output_buffer (
34+
.clk ( clk_i )
35+
);
36+
37+
hwpe_stream_intf_stream #(
38+
.DATA_WIDTH ( InpDataWidth/2 )
39+
) output_buffer_fifo (
40+
.clk ( clk_i )
41+
);
42+
3243

3344
// Stream splitter
3445
hwpe_stream_split #(
@@ -42,33 +53,49 @@ module pace_pingpong_inp #(
4253
.pop_o ( ping_pong_buffer )
4354
);
4455

45-
// Ready/valid handshake
46-
assign ping_pong_buffer[0].ready = output_handshake & ping_pong_status_q & enable_i;
47-
assign ping_pong_buffer[1].ready = output_handshake & ping_pong_status_q & enable_i;
56+
hwpe_stream_package::ctrl_serdes_t ctrl_serdes;
4857

49-
assign output_buffer = ping_pong_status_q ? ping_pong_buffer[1].data : ping_pong_buffer[0].data;
50-
assign valid_o = ping_pong_status_q ? ping_pong_buffer[1].valid : ping_pong_buffer[0].valid;
58+
assign ctrl_serdes.clear_serdes_state = clear_i;
59+
assign ctrl_serdes.nb_contig_m1 = 0;
60+
assign ctrl_serdes.first_stream = 1'b0;
61+
62+
hwpe_stream_serialize #(
63+
.NB_IN_STREAMS ( 2 ),
64+
.CONTIG_LIMIT ( 1024 ),
65+
.DATA_WIDTH ( OupDataWidth ),
66+
.SYNC_READY ( 1'b1 )
67+
) i_hwpe_stream_serialize (
68+
.clk_i ( clk_i ),
69+
.rst_ni ( rst_ni ),
70+
.clear_i ( clear_i ),
71+
.ctrl_i ( ctrl_serdes ),
72+
.push_i ( ping_pong_buffer ),
73+
.pop_o ( output_buffer )
74+
);
75+
76+
hwpe_stream_fifo #(
77+
.DATA_WIDTH ( OupDataWidth ),
78+
.FIFO_DEPTH ( 2 ),
79+
.LATCH_FIFO ( 0 ),
80+
.LATCH_FIFO_TEST_WRAP ( 0 )
81+
) i_hwpe_stream_fifo (
82+
.clk_i ( clk_i ),
83+
.rst_ni ( rst_ni ),
84+
.clear_i ( clear_i ),
85+
.flags_o ( ),
86+
.push_i ( output_buffer ),
87+
.pop_o ( output_buffer_fifo )
88+
);
5189

5290
// Output slicing
5391
generate
5492
for (genvar r = 0; r < NumRows; r++) begin : gen_output_unpack
55-
assign output_o[r] = output_buffer[(OupDataWidth*(r+1))-1 -: OupDataWidth];
93+
assign output_o[r] = output_buffer_fifo.data[(CEOupDataWidth*(r+1))-1 -: CEOupDataWidth];
5694
end
5795
endgenerate
96+
assign valid_o = output_buffer_fifo.valid;
97+
assign output_buffer_fifo.ready = ready_i & enable_i;
5898

59-
// Handshake logic
60-
assign output_handshake = valid_o & ready_i;
6199

62-
// Ping-pong control
63-
assign ping_pong_status_d = clear_i ? 1'b0 :
64-
output_handshake ? ~ping_pong_status_q :
65-
ping_pong_status_q;
66-
always_ff @(posedge clk_i or negedge rst_ni) begin : gen_ping_pong_status_ff
67-
if (~rst_ni) begin
68-
ping_pong_status_q <= 1'b0;
69-
end else begin
70-
ping_pong_status_q <= ping_pong_status_d;
71-
end
72-
end
73100

74101
endmodule

0 commit comments

Comments
 (0)