Skip to content

Commit 82faae7

Browse files
authored
Merge pull request Xilinx#1018 from mmrahorovic/hotfix/vvu_estimations
VVU estimation function fixes
2 parents 10fa01e + ae97e38 commit 82faae7

File tree

3 files changed

+81
-79
lines changed

3 files changed

+81
-79
lines changed

src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2727
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828

29+
import math
2930
import numpy as np
3031
import os
3132
from qonnx.core.datatype import DataType
@@ -47,6 +48,84 @@ def get_nodeattr_types(self):
4748
my_attrs.update(HLSBackend.get_nodeattr_types(self))
4849
return my_attrs
4950

51+
def lut_estimation(self):
52+
"""Calculates resource estimations for LUTs based on:
53+
- FINN-R: An End-to-End Deep-Learning Framework for Fast
54+
Exploration of Quantized Neural Networks
55+
- M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
56+
Y. Umuroglu, M. Leeser and K. Vissers
57+
- 12. Sep 2018
58+
"""
59+
# TODO add in/out FIFO contributions
60+
P = self.get_nodeattr("PE")
61+
Q = self.get_nodeattr("SIMD")
62+
wdt = self.get_weight_datatype()
63+
W = wdt.bitwidth()
64+
# determine tdt with input and weight data types
65+
idt = self.get_input_datatype()
66+
A = idt.bitwidth()
67+
# parameters from experiments in paper mentioned above
68+
c0 = 300
69+
c1 = 1.1
70+
c2 = 0
71+
mmode = self.get_nodeattr("mem_mode")
72+
mstyle = self.get_nodeattr("ram_style")
73+
if (mmode == "internal_decoupled" and mstyle == "distributed") or (
74+
mmode == "internal_embedded" and self.calc_wmem() <= 128
75+
):
76+
c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
77+
78+
# multiplication
79+
res_type = self.get_nodeattr("resType")
80+
if res_type == "dsp":
81+
mult_luts = 0
82+
else:
83+
mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
84+
# adder tree
85+
addertree_luts = (W + A) * (2 * Q - 1)
86+
# accumulator
87+
acc_datatype = self.get_accumulator_datatype()
88+
acc_bits = acc_datatype.bitwidth()
89+
k_h, k_w = self.get_nodeattr("Kernel")
90+
# if accDataType is not set, then it will default to INT32, which would
91+
# be a large overestimate in most (if not all) cases. In this scenario,
92+
# we would use the minimum accumulator as determined by the data types
93+
# bound, derived in https://arxiv.org/abs/2301.13376
94+
alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed())
95+
acc_bits = min(
96+
acc_datatype.bitwidth(),
97+
np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
98+
)
99+
acc_luts = acc_bits
100+
# thresholds and threshold comparators
101+
thr_luts = 0
102+
comp_luts = 0
103+
noact = self.get_nodeattr("noActivation")
104+
# TODO - add 'ram_style_threshold' node attribute
105+
if noact == 0:
106+
odt = self.get_output_datatype()
107+
B = odt.bitwidth()
108+
thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
109+
comp_luts = (2**B - 1) * acc_bits
110+
111+
return int(
112+
c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
113+
)
114+
115+
def dsp_estimation(self):
    """DSP resource estimate for the HLS VVU.

    Counts one multiplier group per PE when resType is "dsp"; zero DSPs
    are used otherwise (multipliers map to LUTs instead).
    """
    # multiplication
    if self.get_nodeattr("resType") != "dsp":
        return 0
    pe = self.get_nodeattr("PE")
    wbits = self.get_weight_datatype().bitwidth()
    abits = self.get_input_datatype().bitwidth()
    # TODO: more accurate modelling
    return int(pe * np.ceil((wbits + abits) / 48))
128+
50129
def execute_node(self, context, graph):
51130
mode = self.get_nodeattr("exec_mode")
52131
mem_mode = self.get_nodeattr("mem_mode")

src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,9 @@ def lut_estimation(self):
144144
return 0
145145

146146
def dsp_estimation(self):
    """DSP resource estimate for the RTL VVU.

    Each of the PE compute lanes packs its SIMD multiplications onto DSPs
    in groups of three, so the total is PE * ceil(SIMD / 3).
    NOTE(review): the 3-lanes-per-DSP packing factor presumably reflects
    the targeted DSP primitive — confirm against the RTL implementation.
    """
    pe = self.get_nodeattr("PE")
    simd = self.get_nodeattr("SIMD")
    return int(pe * np.ceil(simd / 3))
149150

150151
def instantiate_ip(self, cmd):
151152
# instantiate the RTL IP

src/finn/custom_op/fpgadataflow/vectorvectoractivation.py

Lines changed: 0 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -386,84 +386,6 @@ def uram_efficiency_estimation(self):
386386
uram_est_capacity = uram_est * 72 * 4096
387387
return wbits / uram_est_capacity
388388

389-
def lut_estimation(self):
390-
"""Calculates resource estimations for LUTs based on:
391-
- FINN-R: An End-to-End Deep-Learning Framework for Fast
392-
Exploration of Quantized Neural Networks
393-
- M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
394-
Y. Umuroglu, M. Leeser and K. Vissers
395-
- 12. Sep 2018
396-
"""
397-
# TODO add in/out FIFO contributions
398-
P = self.get_nodeattr("PE")
399-
Q = self.get_nodeattr("SIMD")
400-
wdt = self.get_weight_datatype()
401-
W = wdt.bitwidth()
402-
# determine tdt with input and weight data types
403-
idt = self.get_input_datatype()
404-
A = idt.bitwidth()
405-
# parameters from experiments in paper mentioned above
406-
c0 = 300
407-
c1 = 1.1
408-
c2 = 0
409-
mmode = self.get_nodeattr("mem_mode")
410-
mstyle = self.get_nodeattr("ram_style")
411-
if (mmode == "internal_decoupled" and mstyle == "distributed") or (
412-
mmode == "internal_embedded" and self.calc_wmem() <= 128
413-
):
414-
c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
415-
416-
# multiplication
417-
res_type = self.get_nodeattr("resType")
418-
if res_type == "dsp":
419-
mult_luts = 0
420-
else:
421-
mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
422-
# adder tree
423-
addertree_luts = (W + A) * (2 * Q - 1)
424-
# accumulator
425-
acc_datatype = self.get_accumulator_datatype()
426-
acc_bits = acc_datatype.bitwidth()
427-
k_h, k_w = self.get_nodeattr("Kernel")
428-
# if accDataType is not set, then it will default to INT32, which would
429-
# be a large overestimate in most (if not all) cases. In this scenario,
430-
# we would use the minimum accumulator as determined by the data types
431-
# bound, derived in https://arxiv.org/abs/2301.13376
432-
alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed())
433-
acc_bits = min(
434-
acc_datatype.bitwidth(),
435-
np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
436-
)
437-
acc_luts = acc_bits
438-
# thresholds and threshold comparators
439-
thr_luts = 0
440-
comp_luts = 0
441-
noact = self.get_nodeattr("noActivation")
442-
# TODO - add 'ram_style_threshold' node attribute
443-
if noact == 0:
444-
odt = self.get_output_datatype()
445-
B = odt.bitwidth()
446-
thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
447-
comp_luts = (2**B - 1) * acc_bits
448-
449-
return int(
450-
c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
451-
)
452-
453-
def dsp_estimation(self):
454-
# multiplication
455-
P = self.get_nodeattr("PE")
456-
res_type = self.get_nodeattr("resType")
457-
wdt = self.get_weight_datatype()
458-
W = wdt.bitwidth()
459-
idt = self.get_input_datatype()
460-
A = idt.bitwidth()
461-
if res_type == "dsp":
462-
mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling
463-
else:
464-
mult_dsp = 0
465-
return int(mult_dsp)
466-
467389
def get_exp_cycles(self):
468390
pe = self.get_nodeattr("PE")
469391
simd = self.get_nodeattr("SIMD")

0 commit comments

Comments
 (0)