@@ -243,7 +243,14 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
243243 // If the vslideup offset is greater than csr_vl_q, the vslideup has no effects
244244 logic null_vslideup;
245245 // Do all registers of the selected reg group (vs1, vs2, vd), for its EMUL, have the same EEW encoding?
246- logic is_same_eew;
246+ logic vs1_is_same_eew;
247+ logic vs2_is_same_eew;
248+ logic vd_is_same_eew;
249+ logic vs1_some_is_valid;
250+ logic vs2_some_is_valid;
251+ logic vd_some_is_valid;
252+ // EMUL for vs1 and vs2 (vd uses ara_req.emul directly). Useful for reshuffling
253+ rvv_pkg :: vlmul_e emul_vs1, emul_vs2;
247254
248255 // Pipeline the VLSU's load and store complete signals, for timing reasons
249256 logic load_complete, load_complete_q;
@@ -442,6 +449,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
442449 is_config = 1'b0 ;
443450 ignore_zero_vl_check = 1'b0 ;
444451
452+ emul_vs1 = ara_req.emul;
453+ emul_vs2 = ara_req.emul;
454+
445455 // Saturation in any lane will raise vxsat flag
446456 csr_vxsat_d | = | vxsat_flag_i;
447457 // Fixed-point rounding mode is applied to all lanes
@@ -2967,6 +2977,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
29672977 // For memory operations: EMUL = LMUL * (EEW / SEW)
29682978 // EEW is encoded in the instruction
29692979 ara_req.emul = vlmul_e ' (csr_vtype_q.vlmul + (ara_req.vtype.vsew - csr_vtype_q.vsew));
2980+ emul_vs2 = vlmul_e ' (csr_vtype_q.vlmul + (ara_req.eew_vs2 - csr_vtype_q.vsew));
29702981
29712982 // Exception if EMUL > 8 or < 1/8
29722983 unique case ({ csr_vtype_q.vlmul[2 ], ara_req.emul[2 ]} )
@@ -3211,6 +3222,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
32113222 // For memory operations: EMUL = LMUL * (EEW / SEW)
32123223 // EEW is encoded in the instruction
32133224 ara_req.emul = vlmul_e ' (csr_vtype_q.vlmul + (ara_req.vtype.vsew - csr_vtype_q.vsew));
3225+ emul_vs2 = vlmul_e ' (csr_vtype_q.vlmul + (ara_req.eew_vs2 - csr_vtype_q.vsew));
32143226
32153227 // Exception if EMUL > 8 or < 1/8
32163228 unique case ({ csr_vtype_q.vlmul[2 ], ara_req.emul[2 ]} )
@@ -3334,6 +3346,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
33343346 end
33353347 end
33363348 ara_req.eew_vs1 = ara_req.vtype.vsew; // This is the new vs1 EEW
3349+ emul_vs1 = vlmul_e ' (csr_vtype_q.vlmul + (ara_req.eew_vs1 - csr_vtype_q.vsew));
33373350 end
33383351
33393352 // //////////////////////////
@@ -3611,11 +3624,10 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
36113624 // Optimization: reshuffle vs1 and vs2 only if the operation is strictly in-lane
36123625 // Optimization: reshuffle vd only if we are not overwriting the whole vector register!
36133626 // During a vstore, if vstart > 0, reshuffle immediately not to complicate operand fetch stage
3614- // During a vstore with EMUL > 1, reshuffle immediately if the register group's EEW is not the
3615- // same for every reg.
3616- reshuffle_req_d = { ara_req.use_vs1 && (ara_req.eew_vs1 != eew_q[ara_req.vs1]) && eew_valid_q[ara_req.vs1] && (in_lane_op || (is_vstore && ((csr_vstart_q != '0 ) || ! is_same_eew))),
3617- ara_req.use_vs2 && (ara_req.eew_vs2 != eew_q[ara_req.vs2]) && eew_valid_q[ara_req.vs2] && in_lane_op,
3618- ara_req.use_vd && (ara_req.vtype.vsew != eew_q[ara_req.vd ]) && eew_valid_q[ara_req.vd ] && ! (csr_vstart_q == 0 && (csr_vl_q == ((VLENB << ara_req.emul[1 : 0 ]) >> ara_req.vtype.vsew)))} ;
3627+ // If EMUL > 1, reshuffle if the register group's EEW is not the same for every reg.
3628+ reshuffle_req_d = { ara_req.use_vs1 && vs1_some_is_valid && ! (vs1_is_same_eew && ((is_vstore && (csr_vstart_q == '0 )) || (in_lane_op && (ara_req.eew_vs1 == eew_q[ara_req.vs1])))),
3629+ ara_req.use_vs2 && vs2_some_is_valid && ! (vs2_is_same_eew && in_lane_op && (ara_req.eew_vs2 == eew_q[ara_req.vs2])),
3630+ ara_req.use_vd && vd_some_is_valid && ! (vd_is_same_eew && (ara_req.vtype.vsew == eew_q[ara_req.vd])) && (csr_vl_q != ((VLENB << ara_req.emul[1 : 0 ]) >> ara_req.vtype.vsew))} ;
36193631 // Mask out requests if they refer to the same register!
36203632 reshuffle_req_d & = {
36213633 (insn.varith_type.rs1 != insn.varith_type.rs2) && (insn.varith_type.rs1 != insn.varith_type.rd),
@@ -3642,6 +3654,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
36423654 end
36433655 default : ;
36443656 endcase
3657+
3658+ // Mask the next request if we don't need to reshuffle the next reg
3659+ if (eew_new_buffer_d == eew_old_buffer_d) rs_mask_request_d = 1'b1 ;
36453660 end
36463661
36473662 // Reshuffle if at least one of the three registers needs a reshuffle
@@ -3746,7 +3761,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
37463761 logic [15 : 0 ] same_eew_m2;
37473762 logic [7 : 0 ] same_eew_m4;
37483763 logic [3 : 0 ] same_eew_m8;
3749- logic [3 : 0 ] same_eew_by_lmul;
3764+ logic [3 : 0 ] vs1_same_eew_by_lmul;
3765+ logic [3 : 0 ] vs2_same_eew_by_lmul;
3766+ logic [3 : 0 ] vd_same_eew_by_lmul;
37503767
37513768 // LMUL = 2: group of 2 registers
37523769 for (int i = 0 ; i < 16 ; i++ ) begin
@@ -3766,13 +3783,73 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #(
37663783 end
37673784
37683785 // Final selection per LMUL
3769- same_eew_by_lmul[LMUL_1 ] = 1'b1 ; // always same EEW with 1 register
3770- same_eew_by_lmul[LMUL_2 ] = same_eew_m2[ara_req.vs1[4 : 1 ]];
3771- same_eew_by_lmul[LMUL_4 ] = same_eew_m4[ara_req.vs1[4 : 2 ]];
3772- same_eew_by_lmul[LMUL_8 ] = same_eew_m8[ara_req.vs1[4 : 3 ]];
3786+ vs1_same_eew_by_lmul[LMUL_1 ] = 1'b1 ; // always same EEW with 1 register
3787+ vs1_same_eew_by_lmul[LMUL_2 ] = same_eew_m2[ara_req.vs1[4 : 1 ]];
3788+ vs1_same_eew_by_lmul[LMUL_4 ] = same_eew_m4[ara_req.vs1[4 : 2 ]];
3789+ vs1_same_eew_by_lmul[LMUL_8 ] = same_eew_m8[ara_req.vs1[4 : 3 ]];
3790+
3791+ vs2_same_eew_by_lmul[LMUL_1 ] = 1'b1 ; // always same EEW with 1 register
3792+ vs2_same_eew_by_lmul[LMUL_2 ] = same_eew_m2[ara_req.vs2[4 : 1 ]];
3793+ vs2_same_eew_by_lmul[LMUL_4 ] = same_eew_m4[ara_req.vs2[4 : 2 ]];
3794+ vs2_same_eew_by_lmul[LMUL_8 ] = same_eew_m8[ara_req.vs2[4 : 3 ]];
3795+
3796+ vd_same_eew_by_lmul[LMUL_1 ] = 1'b1 ; // always same EEW with 1 register
3797+ vd_same_eew_by_lmul[LMUL_2 ] = same_eew_m2[ara_req.vd[4 : 1 ]];
3798+ vd_same_eew_by_lmul[LMUL_4 ] = same_eew_m4[ara_req.vd[4 : 2 ]];
3799+ vd_same_eew_by_lmul[LMUL_8 ] = same_eew_m8[ara_req.vd[4 : 3 ]];
37733800
37743801 // If EMUL is fractional (emul[2] == 1), EEW is considered uniform
3775- is_same_eew = same_eew_by_lmul[ara_req.emul[1 : 0 ]] | ara_req.emul[2 ];
3802+ vs1_is_same_eew = vs1_same_eew_by_lmul[emul_vs1[1 : 0 ]] | emul_vs1[2 ];
3803+ vs2_is_same_eew = vs2_same_eew_by_lmul[emul_vs2[1 : 0 ]] | emul_vs2[2 ];
3804+ vd_is_same_eew = vd_same_eew_by_lmul[ara_req.emul[1 : 0 ]] | ara_req.emul[2 ];
3805+ end
3806+
3807+ // Check if the selected register groups (vs1, vs2, vd) have at least one register with valid data in it, i.e., OR-reduce eew_valid_q over each group
3808+ always_comb begin
3809+ logic [15 : 0 ] some_valid_m2;
3810+ logic [7 : 0 ] some_valid_m4;
3811+ logic [3 : 0 ] some_valid_m8;
3812+ logic [3 : 0 ] vs1_some_is_valid_by_lmul;
3813+ logic [3 : 0 ] vs2_some_is_valid_by_lmul;
3814+ logic [3 : 0 ] vd_some_is_valid_by_lmul;
3815+
3816+ // LMUL = 2: group of 2 registers (2*i and 2*i+1); valid if either register is valid
3817+ for (int i = 0 ; i < 16 ; i++ ) begin
3818+ some_valid_m2[i] = (eew_valid_q[2 * i] | eew_valid_q[2 * i+ 1 ]);
3819+ end
3820+
3821+ // LMUL = 4: group of 4 registers = OR of its 2 LMUL=2 groups (the extra mid-pair term 4*i+1/4*i+2 is already covered by those groups)
3822+ for (int i = 0 ; i < 8 ; i++ ) begin
3823+ some_valid_m4[i] = (eew_valid_q[4 * i+ 1 ] | eew_valid_q[4 * i+ 2 ]) ||
3824+ (some_valid_m2[2 * i] | some_valid_m2[2 * i+ 1 ]);
3825+ end
3826+
3827+ // LMUL = 8: group of 8 registers = OR of its 2 LMUL=4 groups (the extra mid-pair term 8*i+3/8*i+4 is already covered by those groups)
3828+ for (int i = 0 ; i < 4 ; i++ ) begin
3829+ some_valid_m8[i] = (eew_valid_q[8 * i+ 3 ] | eew_valid_q[8 * i+ 4 ]) ||
3830+ (some_valid_m4[2 * i] | some_valid_m4[2 * i+ 1 ]);
3831+ end
3832+
3833+ // Per-operand candidates, one per integer LMUL; the group index is the register address with its low bits dropped
3834+ vs1_some_is_valid_by_lmul[LMUL_1 ] = eew_valid_q[ara_req.vs1]; // single register: its own valid bit
3835+ vs1_some_is_valid_by_lmul[LMUL_2 ] = some_valid_m2[ara_req.vs1[4 : 1 ]];
3836+ vs1_some_is_valid_by_lmul[LMUL_4 ] = some_valid_m4[ara_req.vs1[4 : 2 ]];
3837+ vs1_some_is_valid_by_lmul[LMUL_8 ] = some_valid_m8[ara_req.vs1[4 : 3 ]];
3838+
3839+ vs2_some_is_valid_by_lmul[LMUL_1 ] = eew_valid_q[ara_req.vs2]; // single register: its own valid bit
3840+ vs2_some_is_valid_by_lmul[LMUL_2 ] = some_valid_m2[ara_req.vs2[4 : 1 ]];
3841+ vs2_some_is_valid_by_lmul[LMUL_4 ] = some_valid_m4[ara_req.vs2[4 : 2 ]];
3842+ vs2_some_is_valid_by_lmul[LMUL_8 ] = some_valid_m8[ara_req.vs2[4 : 3 ]];
3843+
3844+ vd_some_is_valid_by_lmul[LMUL_1 ] = eew_valid_q[ara_req.vd]; // single register: its own valid bit
3845+ vd_some_is_valid_by_lmul[LMUL_2 ] = some_valid_m2[ara_req.vd[4 : 1 ]];
3846+ vd_some_is_valid_by_lmul[LMUL_4 ] = some_valid_m4[ara_req.vd[4 : 2 ]];
3847+ vd_some_is_valid_by_lmul[LMUL_8 ] = some_valid_m8[ara_req.vd[4 : 3 ]];
3848+
3849+ // If EMUL is fractional (emul[2] == 1), the select index is masked to 0, so the group is treated as a single register (assumes LMUL_1 encodes as 0 - TODO confirm in rvv_pkg)
3850+ vs1_some_is_valid = vs1_some_is_valid_by_lmul[emul_vs1[1 : 0 ] & { 2 {~ emul_vs1[2 ]}} ];
3851+ vs2_some_is_valid = vs2_some_is_valid_by_lmul[emul_vs2[1 : 0 ] & { 2 {~ emul_vs2[2 ]}} ];
3852+ vd_some_is_valid = vd_some_is_valid_by_lmul[ara_req.emul[1 : 0 ] & { 2 {~ ara_req.emul[2 ]}} ];
37763853 end
37773854
37783855endmodule : ara_dispatcher
0 commit comments