vortexgpgpu
diff --git a/‎hw/rtl/libs/VX_csa_half_en.sv‎
Lines changed: 0 additions & 186 deletions b/‎hw/rtl/libs/VX_csa_half_en.sv‎
Lines changed: 0 additions & 186 deletions
diff --git a/‎hw/rtl/tcu/drl/VX_tcu_drl_acc.sv‎
Lines changed: 34 additions & 13 deletions b/‎hw/rtl/tcu/drl/VX_tcu_drl_acc.sv‎
Lines changed: 34 additions & 13 deletions
diff --git a/‎hw/rtl/tcu/drl/VX_tcu_drl_align.sv‎
Lines changed: 13 additions & 7 deletions b/‎hw/rtl/tcu/drl/VX_tcu_drl_align.sv‎
Lines changed: 13 additions & 7 deletions
diff --git a/‎hw/rtl/tcu/drl/VX_tcu_drl_exp_bias.sv‎
Lines changed: 14 additions & 9 deletions b/‎hw/rtl/tcu/drl/VX_tcu_drl_exp_bias.sv‎
Lines changed: 14 additions & 9 deletions
@@ -7,7 +7,7 @@
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
-// WAITHOUT WAARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
@@ -20,27 +20,48 @@ module VX_tcu_drl_acc #(
 ) (
     input  wire [N-1:0][W-1:0] sigsIn,
     input  wire fmt_sel,
+    input  wire [N-2:0] sparse_mask,
     output logic [WA-1:0] sigOut,
     output logic [N-2:0] signOuts
 );
-    // Sign-extend fp significands to WA bits (header)
+
+    //input power gating
+    wire [N-1:0][W-1:0] gated_sigsIn;
+    for (genvar i = 0; i < N-1; i++) begin : g_power_gating
+        assign gated_sigsIn[i] = ({W{sparse_mask[i]}} & sigsIn[i]);
+    end
+    assign gated_sigsIn[N-1] = sigsIn[N-1];  //c_val
+
+    //Sign-extend fp significands to WA bits (header)
     wire [N-1:0][WA-1:0] sigsIn_ext;
     for (genvar i = 0; i < N; i++) begin : g_ext_sign
-        assign sigsIn_ext[i] = fmt_sel ? {{(WA-W){1'b0}}, sigsIn[i]} : {{(WA-W){sigsIn[i][W-1]}}, sigsIn[i]};
+        assign sigsIn_ext[i] = fmt_sel ? {{(WA-W){1'b0}}, gated_sigsIn[i]} : {{(WA-W){gated_sigsIn[i][W-1]}}, gated_sigsIn[i]};
     end
 
     //Carry-Save-Adder based significand accumulation
-    VX_csa_half_en #(
-        .N (N),
-        .W (WA),
-        .S (WA-1)
-    ) sig_csa (
-        .operands (sigsIn_ext),
-        .half_en (1'b1),    // TODO: feed sparsity control signal when resolved
-        .sum  (sigOut[WA-2:0]),
-        .cout (sigOut[WA-1])
-    );
+    if (N >= 7) begin : g_large_acc
+        VX_csa_mod4 #(
+            .N (N),
+            .W (WA),
+            .S (WA-1)
+        ) sig_csa (
+            .operands (sigsIn_ext),
+            .sum      (sigOut[WA-2:0]),
+            .cout     (sigOut[WA-1])
+        );
+    end else begin : g_small_acc
+        VX_csa_tree #(
+            .N (N),
+            .W (WA),
+            .S (WA-1)
+        ) sig_csa (
+            .operands (sigsIn_ext),
+            .sum      (sigOut[WA-2:0]),
+            .cout     (sigOut[WA-1])
+        );
+    end
 
+    //Extract prod sigs signs for INT
     for (genvar i = 0; i < N-1; i++) begin : g_signs
         assign signOuts[i] = sigsIn[i][W-1];
     end
 
@@ -20,22 +20,28 @@ module VX_tcu_drl_align #(
     input wire [N-1:0][7:0] shift_amounts,
     input wire [N-1:0][24:0] sigs_in,
     input wire fmt_sel,
+    input wire [N-2:0] sparse_mask,
     output logic [N-1:0][W-1:0] sigs_out
 );
 
+    //input power gating
+    wire [N-1:0][7:0] gated_shift_amounts;
+    wire [N-1:0][24:0] gated_sigs_in;
+    for (genvar i = 0; i < N-1; i++) begin : g_power_gating
+        assign gated_sigs_in[i] = ({25{sparse_mask[i]}} & sigs_in[i]);
+        assign gated_shift_amounts[i] = ({8{sparse_mask[i]}} & shift_amounts[i]);
+    end
+    assign gated_sigs_in[N-1] = sigs_in[N-1];               //c_val
+    assign gated_shift_amounts[N-1] = shift_amounts[N-1];
+
     //extend + align + sign significands
     for (genvar i = 0; i < N; i++) begin : g_align
-        wire [W-1:0] ext_sigs_in = {sigs_in[i], {W-25{1'b0}}};
+        wire [W-1:0] ext_sigs_in = {gated_sigs_in[i], {W-25{1'b0}}};
         wire fp_sign = ext_sigs_in[W-1];
         wire [W-2:0] fp_sig = ext_sigs_in[W-2:0];
-        wire [W-2:0] adj_sig = fp_sig >> shift_amounts[i];
+        wire [W-2:0] adj_sig = fp_sig >> gated_shift_amounts[i];
         wire [W-1:0] fp_val = fp_sign ? -adj_sig : {1'b0, adj_sig};
         assign sigs_out[i] = fmt_sel ? ext_sigs_in : fp_val;
     end
 
 endmodule
-
-/*
-        wire [23:0] adj_sig = shift_amount[3] ? 24'd0 : full_sig[i] >> shift_amount;      //reducing switching activity (power) by clamping to 0 if
-                                                                                        //input won't make a significant impact on accumulated value
-*/
 
@@ -22,7 +22,12 @@ module VX_tcu_drl_exp_bias (
     output logic exp_low_larger,
     output logic [6:0] raw_exp_diff
 );
-    `UNUSED_VAR({a, b, enable});
+
+    //Power gating inputs to prevent switching activity if not enabled
+    wire [15:0] gated_a = {16{enable}} & a;
+    wire [15:0] gated_b = {16{enable}} & b;
+    wire [2:0] gated_fmt_s = {3{enable}} & fmt_s;
+    `UNUSED_VAR({gated_a, gated_b});
 
     //FP16 exponent addition and bias
     wire [7:0] raw_exp_fp16;
@@ -32,7 +37,7 @@ module VX_tcu_drl_exp_bias (
         .W(8),
         .S(8)
     ) biasexp_fp16(
-        .operands({{3'd0, a[14:10]}, {3'd0, b[14:10]}, fp16_32_conv_bias}),
+        .operands({{3'd0, gated_a[14:10]}, {3'd0, gated_b[14:10]}, fp16_32_conv_bias}),
         .sum     (raw_exp_fp16),
         `UNUSED_PIN (cout)
     );
@@ -44,10 +49,10 @@ module VX_tcu_drl_exp_bias (
     `UNUSED_VAR(raw_exp_bf16_signed);
     VX_csa_tree #(
         .N(3),
-        .W(10),    //8 + log2(3) extend for sign handling
+        .W(10),    //8 + log2(3)-extend for sign handling
         .S(10)
     ) biasexp_bf16(
-        .operands({{2'd0, a[14:7]}, {2'd0, b[14:7]}, neg_bias}),
+        .operands({{2'd0, gated_a[14:7]}, {2'd0, gated_b[14:7]}, neg_bias}),
         .sum     (raw_exp_bf16_signed),
         `UNUSED_PIN (cout)
     );
@@ -60,8 +65,8 @@ module VX_tcu_drl_exp_bias (
         VX_ks_adder #(
             .N(4)
         ) raw_exp_fp8_sub_add (
-            .dataa (a[(i*8)+6 -: 4]),
-            .datab (b[(i*8)+6 -: 4]),
+            .dataa (gated_a[(i*8)+6 -: 4]),
+            .datab (gated_b[(i*8)+6 -: 4]),
             .sum   (raw_exp_fp8_sub[i][3:0]),
             .cout  (raw_exp_fp8_sub[i][4])
         );
@@ -86,8 +91,8 @@ module VX_tcu_drl_exp_bias (
         VX_ks_adder #(
             .N(5)
         ) raw_exp_bf8_sub_add (
-            .dataa (a[(j*8)+6 -: 5]),
-            .datab (b[(j*8)+6 -: 5]),
+            .dataa (gated_a[(j*8)+6 -: 5]),
+            .datab (gated_b[(j*8)+6 -: 5]),
             .sum   (raw_exp_bf8_sub[j][4:0]),
             .cout  (raw_exp_bf8_sub[j][5])
         );
@@ -107,7 +112,7 @@ module VX_tcu_drl_exp_bias (
 
     //Select exp out based on datatype
     always_comb begin
-        case(fmt_s[2:0])
+        case(gated_fmt_s[2:0])
             3'd1: begin
                 raw_exp_y      = raw_exp_fp16;
                 exp_low_larger = 1'bx;