Skip to content

Commit 82faae7

Browse files
authored
Merge pull request Xilinx#1018 from mmrahorovic/hotfix/vvu_estimations
VVU estimation function fixes
2 parents 10fa01e + ae97e38 commit 82faae7

File tree

3 files changed

+81
-79
lines changed

3 files changed

+81
-79
lines changed

src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2727
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828

29+
import math
2930
import numpy as np
3031
import os
3132
from qonnx.core.datatype import DataType
@@ -47,6 +48,84 @@ def get_nodeattr_types(self):
4748
my_attrs.update(HLSBackend.get_nodeattr_types(self))
4849
return my_attrs
4950

51+
def lut_estimation(self):
52+
"""Calculates resource estimations for LUTs based on:
53+
- FINN-R: An End-to-End Deep-Learning Framework for Fast
54+
Exploration of Quantized Neural Networks
55+
- M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
56+
Y. Umuroglu, M. Leeser and K. Vissers
57+
- 12. Sep 2018
58+
"""
59+
# TODO add in/out FIFO contributions
60+
P = self.get_nodeattr("PE")
61+
Q = self.get_nodeattr("SIMD")
62+
wdt = self.get_weight_datatype()
63+
W = wdt.bitwidth()
64+
# determine tdt with input and weight data types
65+
idt = self.get_input_datatype()
66+
A = idt.bitwidth()
67+
# parameters from experiments in paper mentioned above
68+
c0 = 300
69+
c1 = 1.1
70+
c2 = 0
71+
mmode = self.get_nodeattr("mem_mode")
72+
mstyle = self.get_nodeattr("ram_style")
73+
if (mmode == "internal_decoupled" and mstyle == "distributed") or (
74+
mmode == "internal_embedded" and self.calc_wmem() <= 128
75+
):
76+
c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
77+
78+
# multiplication
79+
res_type = self.get_nodeattr("resType")
80+
if res_type == "dsp":
81+
mult_luts = 0
82+
else:
83+
mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
84+
# adder tree
85+
addertree_luts = (W + A) * (2 * Q - 1)
86+
# accumulator
87+
acc_datatype = self.get_accumulator_datatype()
88+
acc_bits = acc_datatype.bitwidth()
89+
k_h, k_w = self.get_nodeattr("Kernel")
90+
# if accDataType is not set, then it will default to INT32, which would
91+
# be a large overestimate in most (if not all) cases. In this scenario,
92+
# we would use the minimum accumulator as determined by the data types
93+
# bound, derived in https://arxiv.org/abs/2301.13376
94+
alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed())
95+
acc_bits = min(
96+
acc_datatype.bitwidth(),
97+
np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
98+
)
99+
acc_luts = acc_bits
100+
# thresholds and threshold comparators
101+
thr_luts = 0
102+
comp_luts = 0
103+
noact = self.get_nodeattr("noActivation")
104+
# TODO - add 'ram_style_threshold' node attribute
105+
if noact == 0:
106+
odt = self.get_output_datatype()
107+
B = odt.bitwidth()
108+
thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
109+
comp_luts = (2**B - 1) * acc_bits
110+
111+
return int(
112+
c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
113+
)
114+
115+
def dsp_estimation(self):
    """DSP resource estimate for the HLS VVU.

    Counts one multiplier group per PE when resType is "dsp"; zero DSPs
    are used otherwise (multipliers map to LUTs instead).
    """
    # multiplication
    if self.get_nodeattr("resType") != "dsp":
        return 0
    pe = self.get_nodeattr("PE")
    wbits = self.get_weight_datatype().bitwidth()
    abits = self.get_input_datatype().bitwidth()
    # TODO: more accurate modelling
    return int(pe * np.ceil((wbits + abits) / 48))
128+
50129
def execute_node(self, context, graph):
51130
mode = self.get_nodeattr("exec_mode")
52131
mem_mode = self.get_nodeattr("mem_mode")

src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,9 @@ def lut_estimation(self):
144144
return 0
145145

146146
def dsp_estimation(self):
    """DSP resource estimate for the RTL VVU.

    Each of the PE compute lanes packs its SIMD multiplications onto DSPs
    in groups of three, so the total is PE * ceil(SIMD / 3).
    NOTE(review): the 3-lanes-per-DSP packing factor presumably reflects
    the targeted DSP primitive — confirm against the RTL implementation.
    """
    pe = self.get_nodeattr("PE")
    simd = self.get_nodeattr("SIMD")
    return int(pe * np.ceil(simd / 3))
149150

150151
def instantiate_ip(self, cmd):
151152
# instantiate the RTL IP

src/finn/custom_op/fpgadataflow/vectorvectoractivation.py

Lines changed: 0 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -386,84 +386,6 @@ def uram_efficiency_estimation(self):
386386
uram_est_capacity = uram_est * 72 * 4096
387387
return wbits / uram_est_capacity
388388

389-
def lut_estimation(self):
390-
"""Calculates resource estimations for LUTs based on:
391-
- FINN-R: An End-to-End Deep-Learning Framework for Fast
392-
Exploration of Quantized Neural Networks
393-
- M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
394-
Y. Umuroglu, M. Leeser and K. Vissers
395-
- 12. Sep 2018
396-
"""
397-
# TODO add in/out FIFO contributions
398-
P = self.get_nodeattr("PE")
399-
Q = self.get_nodeattr("SIMD")
400-
wdt = self.get_weight_datatype()
401-
W = wdt.bitwidth()
402-
# determine tdt with input and weight data types
403-
idt = self.get_input_datatype()
404-
A = idt.bitwidth()
405-
# parameters from experiments in paper mentioned above
406-
c0 = 300
407-
c1 = 1.1
408-
c2 = 0
409-
mmode = self.get_nodeattr("mem_mode")
410-
mstyle = self.get_nodeattr("ram_style")
411-
if (mmode == "internal_decoupled" and mstyle == "distributed") or (
412-
mmode == "internal_embedded" and self.calc_wmem() <= 128
413-
):
414-
c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
415-
416-
# multiplication
417-
res_type = self.get_nodeattr("resType")
418-
if res_type == "dsp":
419-
mult_luts = 0
420-
else:
421-
mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
422-
# adder tree
423-
addertree_luts = (W + A) * (2 * Q - 1)
424-
# accumulator
425-
acc_datatype = self.get_accumulator_datatype()
426-
acc_bits = acc_datatype.bitwidth()
427-
k_h, k_w = self.get_nodeattr("Kernel")
428-
# if accDataType is not set, then it will default to INT32, which would
429-
# be a large overestimate in most (if not all) cases. In this scenario,
430-
# we would use the minimum accumulator as determined by the data types
431-
# bound, derived in https://arxiv.org/abs/2301.13376
432-
alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed())
433-
acc_bits = min(
434-
acc_datatype.bitwidth(),
435-
np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
436-
)
437-
acc_luts = acc_bits
438-
# thresholds and threshold comparators
439-
thr_luts = 0
440-
comp_luts = 0
441-
noact = self.get_nodeattr("noActivation")
442-
# TODO - add 'ram_style_threshold' node attribute
443-
if noact == 0:
444-
odt = self.get_output_datatype()
445-
B = odt.bitwidth()
446-
thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
447-
comp_luts = (2**B - 1) * acc_bits
448-
449-
return int(
450-
c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
451-
)
452-
453-
def dsp_estimation(self):
454-
# multiplication
455-
P = self.get_nodeattr("PE")
456-
res_type = self.get_nodeattr("resType")
457-
wdt = self.get_weight_datatype()
458-
W = wdt.bitwidth()
459-
idt = self.get_input_datatype()
460-
A = idt.bitwidth()
461-
if res_type == "dsp":
462-
mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling
463-
else:
464-
mult_dsp = 0
465-
return int(mult_dsp)
466-
467389
def get_exp_cycles(self):
468390
pe = self.get_nodeattr("PE")
469391
simd = self.get_nodeattr("SIMD")

0 commit comments

Comments
 (0)