
Commit 1156ff9

Merge branch 'fastmachinelearning:hls4ml-optimization-api-part-1' into hls4ml-optimization-api-part-1
2 parents e044a12 + 4aff443 commit 1156ff9

24 files changed: +405 −93 lines

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
@@ -2,15 +2,15 @@ exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pyte
 
 repos:
 - repo: https://github.com/psf/black
-  rev: 23.9.1
+  rev: 23.11.0
   hooks:
   - id: black
     language_version: python3
     args: ['--line-length=125',
            '--skip-string-normalization']
 
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.4.0
+  rev: v4.5.0
   hooks:
   - id: check-added-large-files
   - id: check-case-conflict
@@ -30,7 +30,7 @@ repos:
     args: ["--profile", "black", --line-length=125]
 
 - repo: https://github.com/asottile/pyupgrade
-  rev: v3.14.0
+  rev: v3.15.0
  hooks:
  - id: pyupgrade
    args: ["--py36-plus"]

CITATION.cff

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ type: software
 authors:
   - given-names: "FastML Team"
 title: "hls4ml"
-version: "v0.7.1"
+version: "v0.8.0"
 doi: 10.5281/zenodo.1201549
 repository-code: "https://github.com/fastmachinelearning/hls4ml"
 url: "https://fastmachinelearning.org/hls4ml"

README.md

Lines changed: 12 additions & 2 deletions
@@ -1,4 +1,4 @@
-<p float="left">
+<p align="center">
    <img src="https://github.com/fastmachinelearning/fastmachinelearning.github.io/raw/master/images/hls4ml_logo.svg" alt="hls4ml" width="400"/>
 </p>
 
@@ -69,7 +69,7 @@ If you use this software in a publication, please cite the software
    title = {fastmachinelearning/hls4ml},
    year = 2023,
    publisher = {Zenodo},
-   version = {v0.7.1},
+   version = {v0.8.0},
    doi = {10.5281/zenodo.1201549},
    url = {https://github.com/fastmachinelearning/hls4ml}
 }
@@ -140,3 +140,13 @@ binary/ternary networks:
 If you benefited from participating in our community, we ask that you please acknowledge the Fast Machine Learning collaboration, and particular individuals who helped you, in any publications.
 Please use the following text for this acknowledgment:
 > We acknowledge the Fast Machine Learning collective as an open community of multi-domain experts and collaborators. This community and \<names of individuals\>, in particular, were important for the development of this project.
+
+# Funding
+We gratefully acknowledge previous and current support from the U.S. National Science Foundation (NSF) Harnessing the Data Revolution (HDR) Institute for <a href="https://a3d3.ai">Accelerating AI Algorithms for Data Driven Discovery (A3D3)</a> under Cooperative Agreement No. <a href="https://www.nsf.gov/awardsearch/showAward?AWD_ID=2117997">OAC-2117997</a>, U.S. Department of Energy (DOE) Office of Science, Office of Advanced Scientific Computing Research under the Real‐time Data Reduction Codesign at the Extreme Edge for Science (XDR) Project (<a href="https://science.osti.gov/-/media/grants/pdf/foas/2021/SC_FOA_0002501.pdf">DE-FOA-0002501</a>), DOE Office of Science, Office of High Energy Physics Early Career Research Program (<a href="https://pamspublic.science.energy.gov/WebPAMSExternal/Interface/Common/ViewPublicAbstract.aspx?rv=df0ae4ab-a46e-481a-9acc-3856b6b041e5&rtc=24&PRoleId=10">DE-SC0021187</a>, DE-0000247070), and the European Research Council (ERC) under the European Union's Horizon 2020 research and innovation program (Grant No. <a href="https://doi.org/10.3030/772369">772369</a>).
+
+<p align="center">
+   <img src="https://github.com/fastmachinelearning/hls4ml/assets/29201053/bd1217d4-9930-47b7-8917-ad3fc430c75d" alt="A3D3" width="130"/>
+   <img src="https://github.com/fastmachinelearning/hls4ml/assets/4932543/16e77374-9829-40a8-800e-8d12018a7cb3" alt="NSF" width="130"/>
+   <img src="https://github.com/fastmachinelearning/hls4ml/assets/4932543/de6ca6ea-4d1c-4c56-9d93-f759914bbbf9" alt="DOE" width="130"/>
+   <img src="https://github.com/fastmachinelearning/hls4ml/assets/4932543/7a369971-a381-4bb8-932a-7162b173cbac" alt="ERC" width="130"/>
+</p>

docs/api/configuration.rst

Lines changed: 2 additions & 2 deletions
@@ -70,7 +70,7 @@ It looks like this:
    OutputPredictions: keras/KERAS_3layer_predictions.dat
 
    # Backend section (Vivado backend)
-   Part: xcku115-flvb2104-2-i
+   Part: xcvu13p-flga2577-2-e
    ClockPeriod: 5
    IOType: io_parallel # options: io_parallel/io_stream
@@ -97,7 +97,7 @@ There are a number of configuration options that you have. Let's go through the
 The backend-specific section of the configuration depends on the backend. You can get a starting point for the necessary settings using, for example `hls4ml.templates.get_backend('Vivado').create_initial_config()`.
 For Vivado backend the options are:
 
-* **Part**\ : the particular FPGA part number that you are considering, here it's a Xilinx Virtex-7 FPGA
+* **Part**\ : the particular FPGA part number that you are considering, here it's a Xilinx Virtex UltraScale+ VU13P FPGA
 * **ClockPeriod**\ : the clock period, in ns, at which your algorithm runs
 Then you have some optimization parameters for how your algorithm runs:
 * **IOType**\ : your options are ``io_parallel`` or ``io_stream`` which defines the type of data structure used for inputs, intermediate activations between layers, and outputs. For ``io_parallel``, arrays are used that, in principle, can be fully unrolled and are typically implemented in RAMs. For ``io_stream``, HLS streams are used, which are a more efficient/scalable mechanism to represent data that are produced and consumed in a sequential manner. Typically, HLS streams are implemented with FIFOs instead of RAMs. For more information see `here <https://docs.xilinx.com/r/en-US/ug1399-vitis-hls/pragma-HLS-stream>`__.
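
A minimal sketch (not part of this commit) of how the updated example maps onto the Python API quoted in the docs text above, `hls4ml.templates.get_backend('Vivado').create_initial_config()`; the exact keys returned may differ between hls4ml versions.

import hls4ml

# Start from the backend's default configuration, then apply the values
# shown in the updated YAML example above.
config = hls4ml.templates.get_backend('Vivado').create_initial_config()
config['Part'] = 'xcvu13p-flga2577-2-e'  # Virtex UltraScale+ VU13P
config['ClockPeriod'] = 5                # ns
config['IOType'] = 'io_parallel'         # or 'io_stream'
print(config)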

docs/reference.rst

Lines changed: 25 additions & 4 deletions
@@ -1,6 +1,6 @@
-============================
-Citation and Contributors
-============================
+===========================================
+Citation, Acknowledgments, and Contributors
+===========================================
 
 
 Citation
@@ -14,7 +14,7 @@ If you use this software in a publication, please cite the software
    title = {fastmachinelearning/hls4ml},
    year = 2023,
    publisher = {Zenodo},
-   version = {v0.7.1},
+   version = {v0.8.0},
    doi = {10.5281/zenodo.1201549},
    url = {https://github.com/fastmachinelearning/hls4ml}
 }
@@ -90,9 +90,30 @@ Acknowledgments
 ===============
 If you benefited from participating in our community, we ask that you please acknowledge the Fast Machine Learning collaboration, and particular individuals who helped you, in any publications.
 Please use the following text for this acknowledgment:
+
 We acknowledge the Fast Machine Learning collective as an open community of multi-domain experts and collaborators. This community and \<names of individuals\>, in particular, were important for the development of this project.
 
 
+Funding
+=======
+We gratefully acknowledge previous and current support from the U.S. National Science Foundation (NSF) Harnessing the Data Revolution (HDR) Institute for `Accelerating AI Algorithms for Data Driven Discovery (A3D3) <https://a3d3.ai>`_ under Cooperative Agreement No. `OAC-2117997 <https://www.nsf.gov/awardsearch/showAward?AWD_ID=2117997>`_, U.S. Department of Energy (DOE) Office of Science, Office of Advanced Scientific Computing Research under the Real‐time Data Reduction Codesign at the Extreme Edge for Science (XDR) Project (`DE-FOA-0002501 <https://science.osti.gov/-/media/grants/pdf/foas/2021/SC_FOA_0002501.pdf>`_), DOE Office of Science, Office of High Energy Physics Early Career Research Program (`DE-SC0021187 <https://pamspublic.science.energy.gov/WebPAMSExternal/Interface/Common/ViewPublicAbstract.aspx?rv=df0ae4ab-a46e-481a-9acc-3856b6b041e5&rtc=24&PRoleId=10>`_, DE-0000247070), and the European Research Council (ERC) under the European Union's Horizon 2020 research and innovation program (Grant No. `772369 <https://doi.org/10.3030/772369>`_).
+
+.. image:: https://github.com/fastmachinelearning/hls4ml/assets/4932543/d4b6e2a3-3537-4413-9809-8153a7d624d6
+   :height: 200
+   :align: center
+
+.. image:: https://github.com/fastmachinelearning/hls4ml/assets/4932543/16e77374-9829-40a8-800e-8d12018a7cb3
+   :height: 200
+   :align: center
+
+.. image:: https://github.com/fastmachinelearning/hls4ml/assets/4932543/de6ca6ea-4d1c-4c56-9d93-f759914bbbf9
+   :height: 200
+   :align: center
+
+.. image:: https://github.com/fastmachinelearning/hls4ml/assets/4932543/7a369971-a381-4bb8-932a-7162b173cbac
+   :height: 200
+   :align: center
+
 Contributors
 ============

hls4ml/backends/fpga/passes/clone.py

Lines changed: 6 additions & 8 deletions
@@ -20,21 +20,19 @@ def initialize(self):
 class CloneFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
         super().__init__(Clone, include_header=clone_include_list)
-        self.template = None  # to be filled once number of clones known
 
     def format(self, node):
         params = self._default_function_params(node)
         for i, _output in enumerate(node.outputs):
             params['output' + str(i + 1)] = node.variables[node.outputs[i]].name
 
-        if self.template is None:
-            self.template = (
-                'nnet::clone_stream<{input_t}, {output_t}, {size}>({input}, '
-                + ', '.join(['{output' + str(i + 1) + '}' for i in range(len(node.outputs))])
-                + ');'
-            )
+        template = (
+            'nnet::clone_stream<{input_t}, {output_t}, {size}>({input}, '
+            + ', '.join(['{output' + str(i + 1) + '}' for i in range(len(node.outputs))])
+            + ');'
+        )
 
-        return self.template.format(**params)
+        return template.format(**params)
 
 
 def register_clone(backend):
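
A quick sketch (not part of the commit) of what the per-call template built in format() above expands to, for a hypothetical Clone node with two outputs; the output names are made up.

# Reproduce the string construction from format() for two outputs.
outputs = ['layer2_cpy1', 'layer2_cpy2']  # hypothetical output names
template = (
    'nnet::clone_stream<{input_t}, {output_t}, {size}>({input}, '
    + ', '.join(['{output' + str(i + 1) + '}' for i in range(len(outputs))])
    + ');'
)
print(template)
# nnet::clone_stream<{input_t}, {output_t}, {size}>({input}, {output1}, {output2});
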
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+import warnings
+
+from hls4ml.model.layers import Layer, Softmax
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class FixSoftmaxTableSize(OptimizerPass):
+    def match(self, node):
+        return isinstance(node, Softmax)
+
+    def transform(self, model, node: Layer):
+        inp_layer = node.get_input_node()  # type: ignore
+        if not isinstance(inp_layer, Layer):
+            raise RuntimeError(f'Softmax layer {node.name} does not have an input layer')
+
+        input_bw: int = inp_layer.get_attr('result_t').precision.width  # type: ignore
+        table_bw: int = node.get_attr('inv_table_t').precision.width  # type: ignore
+        table_size = int(node.get_attr('table_size'))  # type: ignore
+
+        backend = model.config.config['Backend']
+
+        # The Quartus (Intel) backend needs one extra bit for the table;
+        # without it, simulation crashes with a segmentation fault.
+        backend_limitation = -1 if backend == 'Quartus' else 0
+
+        if 2 ** (min(input_bw, table_bw) + backend_limitation) < table_size:
+            # If the table size is too large w.r.t. the input and table bitwidths,
+            # reduce it to avoid undefined behavior when extracting table indices
+            # from the fixed-point number.
+            node.set_attr('table_size', str(2 ** (min(input_bw, table_bw) + backend_limitation)))
+            if 2**input_bw < table_size:
+                # The message is split across string literals to respect the
+                # 125-character line-length limit.
+                warnings.warn(
+                    (
+                        f"Softmax layer {node.name} table size is too large for input "
+                        f"bitwidth {input_bw}. Setting table size to {2**input_bw}. "
+                        "To avoid this warning, please increase the input bitwidth or "
+                        "decrease the table size."
+                    ),
+                    stacklevel=1,
+                )
+            if 2**table_bw < table_size:
+                warnings.warn(
+                    (
+                        f"Softmax layer {node.name} table size is too large for table "
+                        f"bitwidth {table_bw}. Setting table size to {2**table_bw}. "
+                        "To avoid this warning, please increase the table bitwidth or "
+                        "decrease the table size."
+                    ),
+                    stacklevel=1,
+                )
+            if backend == 'Quartus':
+                warnings.warn(
+                    (
+                        "The Quartus backend uses a table size of 2^min(input_bw, table_bw)/2"
+                        " instead of 2^min(input_bw, table_bw)."
+                    ),
+                    stacklevel=1,
+                )
+        return False
+
+
+def register_softmax__table_size_fix(backend):
+    backend.register_pass('fix_softmax_table_size', FixSoftmaxTableSize)
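
A standalone sketch of the capping rule added above, using assumed example bitwidths (not taken from the commit): the table is shrunk to 2^min(input_bw, table_bw), with one extra bit shaved off for Quartus.

# Assumed example values: 10-bit input, 18-bit inverse table, requested table of 2048 entries.
input_bw, table_bw, table_size = 10, 18, 2048

for backend in ('Vivado', 'Quartus'):
    backend_limitation = -1 if backend == 'Quartus' else 0
    cap = 2 ** (min(input_bw, table_bw) + backend_limitation)
    print(backend, min(table_size, cap))  # Vivado -> 1024, Quartus -> 512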

hls4ml/backends/fpga/passes/repack_stream.py

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,8 @@ def transform(self, model, node):
 
         # Insert new Repack node instead of Reshape
         repack_layer = model.make_node(Repack, 'repack_' + node.name, attrs, node.inputs.copy())
+        # As the result_t attribute is not honored by type conversion, set it manually here
+        repack_layer.attributes[repack_layer.name].type = node.attributes[node.name].type
         model.replace_node(node, repack_layer)
 
         return True

hls4ml/backends/quartus/quartus_backend.py

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ def _register_flows(self):
             'quartus:inplace_parallel_reshape',
             'quartus:inplace_stream_flatten',
             'quartus:skip_softmax',
+            'quartus:fix_softmax_table_size',
         ]
         optimization_flow = register_flow('optimize', optimization_passes, requires=[init_flow], backend=self.name)
 
hls4ml/backends/vivado/passes/convolution_templates.py

Lines changed: 2 additions & 0 deletions
@@ -41,6 +41,8 @@
     static const unsigned out_width = {out_width};
     static const unsigned reuse_factor = {reuse};
     static const unsigned n_zeros = {nzeros};
+    static const unsigned multiplier_limit =
+        DIV_ROUNDUP(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros / reuse_factor;
     static const bool store_weights_in_bram = false;
     static const unsigned strategy = nnet::{strategy};
     static const nnet::conv_implementation implementation = nnet::conv_implementation::{implementation};
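
Illustrative arithmetic (not from the commit) for the multiplier_limit expression added above, with assumed layer parameters: a 3x3 kernel, 16 input channels, 32 filters, reuse_factor 8, and 1152 pruned (zero) weights.

def div_roundup(a, b):
    # Mirrors the DIV_ROUNDUP macro used in the HLS template.
    return (a + b - 1) // b

kernel_size, n_chan, n_filt = 9, 16, 32
reuse_factor, n_zeros = 8, 1152

multiplier_limit = div_roundup(kernel_size * n_chan * n_filt, reuse_factor) - n_zeros // reuse_factor
print(multiplier_limit)  # ceil(4608 / 8) - 1152 // 8 = 576 - 144 = 432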
