Skip to content

Commit 5af9d60

Browse files
committed
[hardware] 🐛 Fix reshuffle with LMUL > 1
1 parent fa9cd88 commit 5af9d60

File tree

1 file changed

+89
-12
lines changed

1 file changed

+89
-12
lines changed

hardware/src/ara_dispatcher.sv

Lines changed: 89 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
243243
// If the vslideup offset is greater than csr_vl_q, the vslideup has no effects
244244
logic null_vslideup;
245245
// Does the selected reg group for the selected EMUL have same EEW encoding?
246-
logic is_same_eew;
246+
logic vs1_is_same_eew;
247+
logic vs2_is_same_eew;
248+
logic vd_is_same_eew;
249+
logic vs1_some_is_valid;
250+
logic vs2_some_is_valid;
251+
logic vd_some_is_valid;
252+
// EMUL for vs1 and vs2 (vd uses ara_req.emul). Used for the reshuffle checks
253+
rvv_pkg::vlmul_e emul_vs1, emul_vs2;
247254

248255
// Pipeline the VLSU's load and store complete signals, for timing reasons
249256
logic load_complete, load_complete_q;
@@ -442,6 +449,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
442449
is_config = 1'b0;
443450
ignore_zero_vl_check = 1'b0;
444451

452+
emul_vs1 = ara_req.emul;
453+
emul_vs2 = ara_req.emul;
454+
445455
// Saturation in any lane will raise vxsat flag
446456
csr_vxsat_d |= |vxsat_flag_i;
447457
// Fixed-point rounding mode is applied to all lanes
@@ -2967,6 +2977,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
29672977
// For memory operations: EMUL = LMUL * (EEW / SEW)
29682978
// EEW is encoded in the instruction
29692979
ara_req.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req.vtype.vsew - csr_vtype_q.vsew));
2980+
emul_vs2 = vlmul_e'(csr_vtype_q.vlmul + (ara_req.eew_vs2 - csr_vtype_q.vsew));
29702981

29712982
// Exception if EMUL > 8 or < 1/8
29722983
unique case ({csr_vtype_q.vlmul[2], ara_req.emul[2]})
@@ -3211,6 +3222,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
32113222
// For memory operations: EMUL = LMUL * (EEW / SEW)
32123223
// EEW is encoded in the instruction
32133224
ara_req.emul = vlmul_e'(csr_vtype_q.vlmul + (ara_req.vtype.vsew - csr_vtype_q.vsew));
3225+
emul_vs2 = vlmul_e'(csr_vtype_q.vlmul + (ara_req.eew_vs2 - csr_vtype_q.vsew));
32143226

32153227
// Exception if EMUL > 8 or < 1/8
32163228
unique case ({csr_vtype_q.vlmul[2], ara_req.emul[2]})
@@ -3334,6 +3346,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
33343346
end
33353347
end
33363348
ara_req.eew_vs1 = ara_req.vtype.vsew; // This is the new vs1 EEW
3349+
emul_vs1 = vlmul_e'(csr_vtype_q.vlmul + (ara_req.eew_vs1 - csr_vtype_q.vsew));
33373350
end
33383351

33393352
////////////////////////////
@@ -3611,11 +3624,10 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
36113624
// Optimization: reshuffle vs1 and vs2 only if the operation is strictly in-lane
36123625
// Optimization: reshuffle vd only if we are not overwriting the whole vector register!
36133626
// During a vstore, if vstart > 0, reshuffle immediately not to complicate operand fetch stage
3614-
// During a vstore with EMUL > 1, reshuffle immediately if the register group's EEW is not the
3615-
// same for every reg.
3616-
reshuffle_req_d = {ara_req.use_vs1 && (ara_req.eew_vs1 != eew_q[ara_req.vs1]) && eew_valid_q[ara_req.vs1] && (in_lane_op || (is_vstore && ((csr_vstart_q != '0) || !is_same_eew))),
3617-
ara_req.use_vs2 && (ara_req.eew_vs2 != eew_q[ara_req.vs2]) && eew_valid_q[ara_req.vs2] && in_lane_op,
3618-
ara_req.use_vd && (ara_req.vtype.vsew != eew_q[ara_req.vd ]) && eew_valid_q[ara_req.vd ] && !(csr_vstart_q == 0 && (csr_vl_q == ((VLENB << ara_req.emul[1:0]) >> ara_req.vtype.vsew)))};
3627+
// If EMUL > 1, reshuffle if the register group's EEW is not the same for every reg.
3628+
reshuffle_req_d = {ara_req.use_vs1 && vs1_some_is_valid && !(vs1_is_same_eew && ((is_vstore && (csr_vstart_q == '0)) || (in_lane_op && (ara_req.eew_vs1 == eew_q[ara_req.vs1])))),
3629+
ara_req.use_vs2 && vs2_some_is_valid && !(vs2_is_same_eew && in_lane_op && (ara_req.eew_vs2 == eew_q[ara_req.vs2])),
3630+
ara_req.use_vd && vd_some_is_valid && !(vd_is_same_eew && (ara_req.vtype.vsew == eew_q[ara_req.vd])) && (csr_vl_q != ((VLENB << ara_req.emul[1:0]) >> ara_req.vtype.vsew))};
36193631
// Mask out requests if they refer to the same register!
36203632
reshuffle_req_d &= {
36213633
(insn.varith_type.rs1 != insn.varith_type.rs2) && (insn.varith_type.rs1 != insn.varith_type.rd),
@@ -3642,6 +3654,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
36423654
end
36433655
default:;
36443656
endcase
3657+
3658+
// Mask the next request if we don't need to reshuffle the next reg
3659+
if (eew_new_buffer_d == eew_old_buffer_d) rs_mask_request_d = 1'b1;
36453660
end
36463661

36473662
// Reshuffle if at least one of the three registers needs a reshuffle
@@ -3746,7 +3761,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
37463761
logic [15:0] same_eew_m2;
37473762
logic [7:0] same_eew_m4;
37483763
logic [3:0] same_eew_m8;
3749-
logic [3:0] same_eew_by_lmul;
3764+
logic [3:0] vs1_same_eew_by_lmul;
3765+
logic [3:0] vs2_same_eew_by_lmul;
3766+
logic [3:0] vd_same_eew_by_lmul;
37503767

37513768
// LMUL = 2: group of 2 registers
37523769
for (int i = 0; i < 16; i++) begin
@@ -3766,13 +3783,73 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
37663783
end
37673784

37683785
// Final selection per LMUL
3769-
same_eew_by_lmul[LMUL_1] = 1'b1; // always same EEW with 1 register
3770-
same_eew_by_lmul[LMUL_2] = same_eew_m2[ara_req.vs1[4:1]];
3771-
same_eew_by_lmul[LMUL_4] = same_eew_m4[ara_req.vs1[4:2]];
3772-
same_eew_by_lmul[LMUL_8] = same_eew_m8[ara_req.vs1[4:3]];
3786+
vs1_same_eew_by_lmul[LMUL_1] = 1'b1; // always same EEW with 1 register
3787+
vs1_same_eew_by_lmul[LMUL_2] = same_eew_m2[ara_req.vs1[4:1]];
3788+
vs1_same_eew_by_lmul[LMUL_4] = same_eew_m4[ara_req.vs1[4:2]];
3789+
vs1_same_eew_by_lmul[LMUL_8] = same_eew_m8[ara_req.vs1[4:3]];
3790+
3791+
vs2_same_eew_by_lmul[LMUL_1] = 1'b1; // always same EEW with 1 register
3792+
vs2_same_eew_by_lmul[LMUL_2] = same_eew_m2[ara_req.vs2[4:1]];
3793+
vs2_same_eew_by_lmul[LMUL_4] = same_eew_m4[ara_req.vs2[4:2]];
3794+
vs2_same_eew_by_lmul[LMUL_8] = same_eew_m8[ara_req.vs2[4:3]];
3795+
3796+
vd_same_eew_by_lmul[LMUL_1] = 1'b1; // always same EEW with 1 register
3797+
vd_same_eew_by_lmul[LMUL_2] = same_eew_m2[ara_req.vd[4:1]];
3798+
vd_same_eew_by_lmul[LMUL_4] = same_eew_m4[ara_req.vd[4:2]];
3799+
vd_same_eew_by_lmul[LMUL_8] = same_eew_m8[ara_req.vd[4:3]];
37733800

37743801
// If EMUL is fractional (emul[2] == 1), EEW is considered uniform
3775-
is_same_eew = same_eew_by_lmul[ara_req.emul[1:0]] | ara_req.emul[2];
3802+
vs1_is_same_eew = vs1_same_eew_by_lmul[emul_vs1[1:0]] | emul_vs1[2];
3803+
vs2_is_same_eew = vs2_same_eew_by_lmul[emul_vs2[1:0]] | emul_vs2[2];
3804+
vd_is_same_eew = vd_same_eew_by_lmul[ara_req.emul[1:0]] | ara_req.emul[2];
3805+
end
3806+
3807+
// Check if the selected register groups have at least one register with valid data in it.
// Derives vs1_some_is_valid, vs2_some_is_valid and vd_some_is_valid from eew_valid_q,
// selecting the group size with emul_vs1, emul_vs2 and ara_req.emul respectively.
3808+
always_comb begin
3809+
logic [15:0] some_valid_m2;
3810+
logic [7:0] some_valid_m4;
3811+
logic [3:0] some_valid_m8;
3812+
logic [3:0] vs1_some_is_valid_by_lmul;
3813+
logic [3:0] vs2_some_is_valid_by_lmul;
3814+
logic [3:0] vd_some_is_valid_by_lmul;
3815+
3816+
// LMUL = 2: group of 2 registers (OR of the valid bits of registers 2*i and 2*i+1)
3817+
for (int i = 0; i < 16; i++) begin
3818+
some_valid_m2[i] = (eew_valid_q[2*i] | eew_valid_q[2*i+1]);
3819+
end
3820+
3821+
// LMUL = 4: group of 4 registers (OR of the 2 LMUL=2 groups, plus the mid pair)
// NOTE(review): registers 4*i+1 and 4*i+2 are already covered by some_valid_m2[2*i] and
// some_valid_m2[2*i+1], so the mid-pair term does not change the result of this OR.
3822+
for (int i = 0; i < 8; i++) begin
3823+
some_valid_m4[i] = (eew_valid_q[4*i+1] | eew_valid_q[4*i+2]) ||
3824+
(some_valid_m2[2*i] | some_valid_m2[2*i+1]);
3825+
end
3826+
3827+
// LMUL = 8: group of 8 registers (OR of the 2 LMUL=4 groups, plus the mid pair)
// NOTE(review): registers 8*i+3 and 8*i+4 are already covered by some_valid_m4[2*i] and
// some_valid_m4[2*i+1], so the mid-pair term does not change the result of this OR.
3828+
for (int i = 0; i < 4; i++) begin
3829+
some_valid_m8[i] = (eew_valid_q[8*i+3] | eew_valid_q[8*i+4]) ||
3830+
(some_valid_m4[2*i] | some_valid_m4[2*i+1]);
3831+
end
3832+
3833+
// Final selection per LMUL: one candidate bit per group size, for each of vs1, vs2 and vd.
// With a single register, the group's validity is that register's own valid bit.
3834+
vs1_some_is_valid_by_lmul[LMUL_1] = eew_valid_q[ara_req.vs1];
3835+
vs1_some_is_valid_by_lmul[LMUL_2] = some_valid_m2[ara_req.vs1[4:1]];
3836+
vs1_some_is_valid_by_lmul[LMUL_4] = some_valid_m4[ara_req.vs1[4:2]];
3837+
vs1_some_is_valid_by_lmul[LMUL_8] = some_valid_m8[ara_req.vs1[4:3]];
3838+
3839+
vs2_some_is_valid_by_lmul[LMUL_1] = eew_valid_q[ara_req.vs2];
3840+
vs2_some_is_valid_by_lmul[LMUL_2] = some_valid_m2[ara_req.vs2[4:1]];
3841+
vs2_some_is_valid_by_lmul[LMUL_4] = some_valid_m4[ara_req.vs2[4:2]];
3842+
vs2_some_is_valid_by_lmul[LMUL_8] = some_valid_m8[ara_req.vs2[4:3]];
3843+
3844+
vd_some_is_valid_by_lmul[LMUL_1] = eew_valid_q[ara_req.vd];
3845+
vd_some_is_valid_by_lmul[LMUL_2] = some_valid_m2[ara_req.vd[4:1]];
3846+
vd_some_is_valid_by_lmul[LMUL_4] = some_valid_m4[ara_req.vd[4:2]];
3847+
vd_some_is_valid_by_lmul[LMUL_8] = some_valid_m8[ara_req.vd[4:3]];
3848+
3849+
// If EMUL is fractional (emul[2] == 1), the group is treated as a single register:
// the select index is masked to 2'b00 (assumed to be the LMUL_1 entry - confirm in rvv_pkg).
3850+
vs1_some_is_valid = vs1_some_is_valid_by_lmul[emul_vs1[1:0] & {2{~emul_vs1[2]}}];
3851+
vs2_some_is_valid = vs2_some_is_valid_by_lmul[emul_vs2[1:0] & {2{~emul_vs2[2]}}];
3852+
vd_some_is_valid = vd_some_is_valid_by_lmul[ara_req.emul[1:0] & {2{~ara_req.emul[2]}}];
37763853
end
37773854

37783855
endmodule : ara_dispatcher

0 commit comments

Comments
 (0)