Skip to content

Commit ecc2fd2

Browse files
rtl/compute_unit: multi-dispatch dispatcher and opc stage
1 parent ac41efd commit ecc2fd2

File tree

6 files changed

+518
-339
lines changed

6 files changed

+518
-339
lines changed

rtl/compute_unit/compute_unit.sv

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -555,15 +555,15 @@ module compute_unit import bgpu_pkg::*; #(
555555
.clk_i ( clk_i ),
556556
.rst_ni( rst_ni ),
557557

558-
.opc_ready_o ( opc_to_disp_ready ),
559-
.disp_valid_i ( disp_to_opc_valid ),
560-
.disp_tag_i ( disp_to_opc_data.tag ),
561-
.disp_pc_i ( disp_to_opc_data.pc ),
562-
.disp_act_mask_i ( disp_to_opc_data.act_mask ),
563-
.disp_inst_i ( disp_to_opc_data.inst ),
564-
.disp_dst_i ( disp_to_opc_data.dst ),
565-
.disp_src_required_i( disp_to_opc_data.operands_is_reg ),
566-
.disp_src_i ( disp_to_opc_data.operands ),
558+
.opc_ready_o ( opc_to_disp_ready ),
559+
.disp_valid_i ( disp_to_opc_valid ),
560+
.disp_tag_i ( disp_to_opc_data.tag ),
561+
.disp_pc_i ( disp_to_opc_data.pc ),
562+
.disp_act_mask_i ( disp_to_opc_data.act_mask ),
563+
.disp_inst_i ( disp_to_opc_data.inst ),
564+
.disp_dst_i ( disp_to_opc_data.dst ),
565+
.disp_src_is_reg_i ( disp_to_opc_data.operands_is_reg ),
566+
.disp_src_i ( disp_to_opc_data.operands ),
567567

568568
.eu_ready_i ( eu_to_opc_ready_q ),
569569
.opc_valid_o ( opc_to_eu_valid_d ),

rtl/compute_unit/dispatcher/multi_warp_dispatcher.sv

Lines changed: 103 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
module multi_warp_dispatcher import bgpu_pkg::*; #(
88
/// Number of instructions to fetch for the warp
99
parameter int unsigned FetchWidth = 1,
10+
/// Number of instructions to dispatch simultaneously
11+
// Each warp dispatches at most one instruction per cycle -> reduces complexity in the dispatcher
12+
// but multiple warps can dispatch simultaneously
13+
parameter int unsigned DispatchWidth = 1,
1014
/// Number of instructions that can write back simultaneously
1115
parameter int unsigned WritebackWidth = 1,
1216
/// Number of inflight instructions per warp
@@ -73,19 +77,19 @@ module multi_warp_dispatcher import bgpu_pkg::*; #(
7377
input op_reg_idx_t [FetchWidth-1:0] dec_operands_i,
7478

7579
/// To Operand Collector
76-
input logic opc_ready_i,
77-
output logic disp_valid_o,
78-
output iid_t disp_tag_o,
79-
output pc_t disp_pc_o,
80-
output act_mask_t disp_act_mask_o,
81-
output inst_t disp_inst_o,
82-
output reg_idx_t disp_dst_o,
83-
output op_is_reg_t disp_operands_is_reg_o,
84-
output op_reg_idx_t disp_operands_o,
80+
input logic [DispatchWidth-1:0] opc_ready_i,
81+
output logic [DispatchWidth-1:0] disp_valid_o,
82+
output iid_t [DispatchWidth-1:0] disp_tag_o,
83+
output pc_t [DispatchWidth-1:0] disp_pc_o,
84+
output act_mask_t [DispatchWidth-1:0] disp_act_mask_o,
85+
output inst_t [DispatchWidth-1:0] disp_inst_o,
86+
output reg_idx_t [DispatchWidth-1:0] disp_dst_o,
87+
output op_is_reg_t [DispatchWidth-1:0] disp_operands_is_reg_o,
88+
output op_reg_idx_t [DispatchWidth-1:0] disp_operands_o,
8589

8690
/// From Operand Collector -> instruction has read its operands
87-
input logic opc_eu_handshake_i,
88-
input iid_t opc_eu_tag_i,
91+
input logic [DispatchWidth-1:0] opc_eu_handshake_i,
92+
input iid_t [DispatchWidth-1:0] opc_eu_tag_i,
8993

9094
/// From Execution Units
9195
input logic [WritebackWidth-1:0] eu_valid_i,
@@ -120,18 +124,20 @@ module multi_warp_dispatcher import bgpu_pkg::*; #(
120124
tag_t [WritebackWidth-1:0] eu_tag;
121125

122126
// Round Robin Arbiter
123-
warp_mask_t arb_gnt;
124-
warp_mask_t rr_inst_ready;
127+
warp_mask_t arb_gnts;
128+
warp_mask_t [DispatchWidth-1:0] arb_gnt;
129+
warp_mask_t [DispatchWidth-1:0] rr_inst_ready;
125130

126-
wid_t arb_sel_wid;
131+
wid_t [DispatchWidth-1:0] arb_sel_wid;
132+
disp_data_t [DispatchWidth-1:0] arb_sel_data;
127133
disp_data_t [NumWarps-1:0] arb_in_data;
128-
disp_data_t arb_sel_data;
129134

130135
// Decoded Demultiplexer
131136
fetch_mask_t [NumWarps-1:0] dec_decoded_unused_ibe;
132137

133138
// OPC EU Handshake Demultiplexer
134139
warp_mask_t opc_eu_handshake_warp;
140+
tag_t [NumWarps-1:0] opc_eu_tag;
135141

136142
// #######################################################################################
137143
// # Dispatcher per warp #
@@ -175,14 +181,26 @@ module multi_warp_dispatcher import bgpu_pkg::*; #(
175181
// OPC EU Handshake Demultiplexer
176182
always_comb begin
177183
opc_eu_handshake_warp = '0;
178-
opc_eu_handshake_warp[opc_eu_tag_i[WidWidth-1:0]] = opc_eu_handshake_i;
184+
opc_eu_tag = '0;
185+
for (int didx = 0; didx < DispatchWidth; didx++) begin
186+
opc_eu_handshake_warp[opc_eu_tag_i[didx][WidWidth-1:0]] = opc_eu_handshake_i[didx];
187+
opc_eu_tag[opc_eu_tag_i[didx][WidWidth-1:0]] = opc_eu_tag_i[didx][WidWidth+:TagWidth];
188+
end
179189
end
180190

181191
// Extract EU Tags
182192
for (genvar wb = 0; wb < WritebackWidth; wb++) begin : gen_eu_tags
183193
assign eu_tag[wb] = eu_tag_i[wb][WidWidth+:TagWidth];
184194
end : gen_eu_tags
185195

196+
// Combine all arbiter grants
197+
always_comb begin
198+
arb_gnts = '0;
199+
for (int didx = 0; didx < DispatchWidth; didx++) begin
200+
arb_gnts |= arb_gnt[didx];
201+
end
202+
end
203+
186204
// Dispatcher per Warp
187205
for (genvar warp = 0; warp < NumWarps; warp++) begin : gen_dispatcher
188206
dispatcher #(
@@ -216,8 +234,8 @@ module multi_warp_dispatcher import bgpu_pkg::*; #(
216234
.dec_operands_is_reg_i( dec_operands_is_reg_i ),
217235
.dec_operands_i ( dec_operands_i ),
218236

219-
.opc_ready_i ( arb_gnt [warp] ),
220-
.disp_valid_o ( rr_inst_ready[warp] ),
237+
.opc_ready_i ( arb_gnts [warp] ),
238+
.disp_valid_o ( rr_inst_ready[0][warp] ),
221239
.disp_pc_o ( arb_in_data [warp].pc ),
222240
.disp_act_mask_o ( arb_in_data [warp].act_mask ),
223241
.disp_tag_o ( arb_in_data [warp].tag ),
@@ -226,8 +244,8 @@ module multi_warp_dispatcher import bgpu_pkg::*; #(
226244
.disp_operands_is_reg_o( arb_in_data [warp].operands_is_reg ),
227245
.disp_operands_o ( arb_in_data [warp].operands ),
228246

229-
.opc_eu_handshake_i( opc_eu_handshake_warp[warp] ),
230-
.opc_eu_tag_i ( opc_eu_tag_i[WidWidth+:TagWidth] ),
247+
.opc_eu_handshake_i( opc_eu_handshake_warp[warp] ),
248+
.opc_eu_tag_i ( opc_eu_tag [warp] ),
231249

232250
.eu_valid_i( eu_valid[warp] ),
233251
.eu_tag_i ( eu_tag )
@@ -238,38 +256,70 @@ module multi_warp_dispatcher import bgpu_pkg::*; #(
238256
// # Round Robin Arbiter #
239257
// #######################################################################################
240258

241-
rr_arb_tree #(
242-
.DataType ( disp_data_t ),
243-
.NumIn ( NumWarps ),
244-
.ExtPrio ( 1'b0 ),
245-
.AxiVldRdy( 1'b0 ),
246-
.LockIn ( 1'b0 ),
247-
.FairArb ( 1'b1 )
248-
) i_rr_arb (
249-
.clk_i ( clk_i ),
250-
.rst_ni( rst_ni ),
251-
252-
.req_i ( rr_inst_ready ),
253-
.gnt_o ( arb_gnt ),
254-
.data_i ( arb_in_data ),
255-
256-
// Directly to Operand Collector
257-
.req_o ( disp_valid_o ),
258-
.gnt_i ( opc_ready_i ),
259-
.data_o( arb_sel_data ),
260-
.idx_o ( arb_sel_wid ),
261-
262-
// Unused
263-
.flush_i( 1'b0 ),
264-
.rr_i ( '0 )
265-
);
266-
267-
assign disp_tag_o = {arb_sel_data.tag, arb_sel_wid};
268-
assign disp_pc_o = arb_sel_data.pc;
269-
assign disp_act_mask_o = arb_sel_data.act_mask;
270-
assign disp_inst_o = arb_sel_data.inst;
271-
assign disp_dst_o = arb_sel_data.dst_reg;
272-
assign disp_operands_is_reg_o = arb_sel_data.operands_is_reg;
273-
assign disp_operands_o = arb_sel_data.operands;
259+
for (genvar didx = 0; didx < DispatchWidth; didx++) begin : gen_rr_arb
260+
if (didx > 0) begin : gen_upper_rr_inst_ready
261+
assign rr_inst_ready[didx] = rr_inst_ready[didx-1] & (~arb_gnt[didx-1]);
262+
end : gen_upper_rr_inst_ready
263+
264+
rr_arb_tree #(
265+
.DataType ( disp_data_t ),
266+
.NumIn ( NumWarps ),
267+
.ExtPrio ( 1'b0 ),
268+
.AxiVldRdy( 1'b0 ),
269+
.LockIn ( 1'b0 ),
270+
.FairArb ( 1'b1 )
271+
) i_rr_arb (
272+
.clk_i ( clk_i ),
273+
.rst_ni( rst_ni ),
274+
275+
.req_i ( rr_inst_ready[didx] ),
276+
.gnt_o ( arb_gnt [didx] ),
277+
.data_i ( arb_in_data ),
278+
279+
// Directly to Operand Collector
280+
.req_o ( disp_valid_o[didx] ),
281+
.gnt_i ( opc_ready_i [didx] ),
282+
.data_o( arb_sel_data[didx] ),
283+
.idx_o ( arb_sel_wid [didx] ),
284+
285+
// Unused
286+
.flush_i( 1'b0 ),
287+
.rr_i ( '0 )
288+
);
289+
290+
assign disp_tag_o [didx] = {arb_sel_data[didx].tag, arb_sel_wid[didx]};
291+
assign disp_pc_o [didx] = arb_sel_data[didx].pc;
292+
assign disp_act_mask_o [didx] = arb_sel_data[didx].act_mask;
293+
assign disp_inst_o [didx] = arb_sel_data[didx].inst;
294+
assign disp_dst_o [didx] = arb_sel_data[didx].dst_reg;
295+
assign disp_operands_is_reg_o[didx] = arb_sel_data[didx].operands_is_reg;
296+
assign disp_operands_o [didx] = arb_sel_data[didx].operands;
297+
end : gen_rr_arb
298+
299+
// #######################################################################################
300+
// # Assertions #
301+
// #######################################################################################
302+
303+
`ifndef SYNTHESIS
304+
for (genvar didx = 0; didx < DispatchWidth; didx++) begin : gen_out_asserts
305+
for (genvar other_didx = 0; other_didx < DispatchWidth; other_didx++)
306+
begin : gen_out_asserts_inner
307+
if (didx != other_didx) begin : gen_diff_didx
308+
// Check that no two dispatch outputs receive an OPC EU handshake for the same warp in the same cycle
309+
assert property (@(posedge clk_i) disable iff (!rst_ni)
310+
(opc_eu_handshake_i[didx] && opc_eu_handshake_i[other_didx]
311+
-> opc_eu_tag_i[didx][WidWidth-1:0]
312+
!= opc_eu_tag_i[other_didx][WidWidth-1:0]))
313+
else $error("OPC EU Handshake for the same warp received!");
314+
315+
// Check that no two dispatch outputs dispatch to the same warp in the same cycle
316+
assert property (@(posedge clk_i) disable iff (!rst_ni)
317+
(disp_valid_o[didx] && disp_valid_o[other_didx]
318+
-> arb_gnt[didx] != arb_gnt[other_didx]))
319+
else $error("Two outputs dispatching to the same warp in the same cycle!");
320+
end : gen_diff_didx
321+
end : gen_out_asserts_inner
322+
end : gen_out_asserts
323+
`endif // SYNTHESIS
274324

275325
endmodule : multi_warp_dispatcher

rtl/compute_unit/register_opc/operand_collector.sv

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -22,29 +22,31 @@ module operand_collector import bgpu_pkg::*; #(
2222
parameter int unsigned RegWidth = 32,
2323

2424
/// Dependent parameter, do **not** overwrite.
25-
parameter int unsigned TagWidth = $clog2(NumTags),
26-
parameter int unsigned WidWidth = NumWarps > 1 ? $clog2(NumWarps) : 1,
27-
parameter type wid_t = logic [ WidWidth-1:0],
28-
parameter type reg_idx_t = logic [ RegIdxWidth-1:0],
29-
parameter type pc_t = logic [ PcWidth-1:0],
30-
parameter type act_mask_t = logic [ WarpWidth-1:0],
31-
parameter type warp_data_t = logic [RegWidth * WarpWidth-1:0],
32-
parameter type iid_t = logic [ TagWidth+WidWidth-1:0]
25+
parameter int unsigned TagWidth = $clog2(NumTags),
26+
parameter int unsigned WidWidth = NumWarps > 1 ? $clog2(NumWarps) : 1,
27+
parameter type wid_t = logic [ WidWidth-1:0],
28+
parameter type reg_idx_t = logic [ RegIdxWidth-1:0],
29+
parameter type pc_t = logic [ PcWidth-1:0],
30+
parameter type act_mask_t = logic [ WarpWidth-1:0],
31+
parameter type warp_data_t = logic [RegWidth * WarpWidth-1:0],
32+
parameter type iid_t = logic [ TagWidth+WidWidth-1:0],
33+
parameter type op_is_reg_t = logic [ OperandsPerInst-1:0],
34+
parameter type op_reg_idx_t = reg_idx_t [ OperandsPerInst-1:0]
3335
) (
3436
// Clock and Reset
3537
input logic clk_i,
3638
input logic rst_ni,
3739

3840
/// From Multi Warp Dispatcher
39-
output logic opc_ready_o,
40-
input logic disp_valid_i,
41-
input iid_t disp_tag_i,
42-
input pc_t disp_pc_i,
43-
input act_mask_t disp_act_mask_i,
44-
input inst_t disp_inst_i,
45-
input reg_idx_t disp_dst_i,
46-
input logic [OperandsPerInst-1:0] disp_src_required_i,
47-
input reg_idx_t [OperandsPerInst-1:0] disp_src_i,
41+
output logic opc_ready_o,
42+
input logic disp_valid_i,
43+
input iid_t disp_tag_i,
44+
input pc_t disp_pc_i,
45+
input act_mask_t disp_act_mask_i,
46+
input inst_t disp_inst_i,
47+
input reg_idx_t disp_dst_i,
48+
input op_is_reg_t disp_src_is_reg_i,
49+
input op_reg_idx_t disp_src_i,
4850

4951
/// To Register File
5052
output logic [OperandsPerInst-1:0] opc_read_req_valid_o,
@@ -132,9 +134,9 @@ module operand_collector import bgpu_pkg::*; #(
132134
// Insert new instruction |-> Handshake
133135
if (disp_valid_i && opc_ready_o) begin : new_instruction
134136
// If the operand is not a register, mark it as already requested and ready
135-
operand_d[i].requested = !disp_src_required_i[i];
136-
operand_d[i].ready = !disp_src_required_i[i];
137-
if (disp_src_required_i[i])
137+
operand_d[i].requested = !disp_src_is_reg_i[i];
138+
operand_d[i].ready = !disp_src_is_reg_i[i];
139+
if (disp_src_is_reg_i[i])
138140
operand_d[i].reg_idx = disp_src_i[i];
139141
else begin : operands_not_required
140142
// Store register index of operands in the data

0 commit comments

Comments
 (0)