diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b3d872190..0601a84b2d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ exclude: (^hls4ml\/templates\/(vivado|quartus)\/(ap_types|ac_types)\/|^test/pyte repos: - repo: https://github.com/psf/black - rev: 24.8.0 + rev: 24.10.0 hooks: - id: black language_version: python3 @@ -10,7 +10,7 @@ repos: '--skip-string-normalization'] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-added-large-files - id: check-case-conflict @@ -30,13 +30,13 @@ repos: args: ["--profile", "black", --line-length=125] - repo: https://github.com/asottile/pyupgrade - rev: v3.17.0 + rev: v3.19.0 hooks: - id: pyupgrade args: ["--py36-plus"] - repo: https://github.com/asottile/setup-cfg-fmt - rev: v2.5.0 + rev: v2.7.0 hooks: - id: setup-cfg-fmt @@ -50,7 +50,7 @@ repos: '--extend-ignore=E203,T201'] # E203 is not PEP8 compliant - repo: https://github.com/mgedmin/check-manifest - rev: "0.49" + rev: "0.50" hooks: - id: check-manifest stages: [manual] diff --git a/CITATION.cff b/CITATION.cff index 9e1880f03f..91bf036a1d 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -4,7 +4,7 @@ type: software authors: - given-names: "FastML Team" title: "hls4ml" -version: "v0.8.1" +version: "v1.0.0" doi: 10.5281/zenodo.1201549 repository-code: "https://github.com/fastmachinelearning/hls4ml" url: "https://fastmachinelearning.org/hls4ml" diff --git a/Jenkinsfile b/Jenkinsfile index b943ce3480..5ca79a484c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -16,7 +16,7 @@ pipeline { sh '''#!/bin/bash --login conda activate hls4ml-py310 conda install -y jupyterhub pydot graphviz pytest pytest-cov - pip install pytest-randomly jupyter onnx>=1.4.0 matplotlib pandas seaborn pydigitalwavetools==1.1 pyyaml tensorflow==2.14 qonnx torch git+https://github.com/google/qkeras.git pyparsing + pip install pytest-randomly jupyter onnx>=1.4.0 matplotlib pandas seaborn pydigitalwavetools==1.1 pyyaml tensorflow==2.14 qonnx torch git+https://github.com/jmitrevs/qkeras.git@qrecurrent_unstack pyparsing pip install -U ../ --user ./convert-keras-models.sh -x -f keras-models.txt pip uninstall hls4ml -y''' diff --git a/README.md b/README.md index 606e824d09..fd96763476 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,9 @@ If you have any questions, comments, or ideas regarding hls4ml or just want to s # Documentation & Tutorial -For more information visit the webpage: [https://fastmachinelearning.org/hls4ml/](https://fastmachinelearning.org/hls4ml/) +For more information visit the webpage: [https://fastmachinelearning.org/hls4ml/](https://fastmachinelearning.org/hls4ml/). + +For introductory material on FPGAs, HLS and ML inference using hls4ml, check out the [video](https://www.youtube.com/watch?v=2y3GNY4tf7A&ab_channel=SystemsGroupatETHZ%C3%BCrich). Detailed tutorials on how to use `hls4ml`'s various functionalities can be found [here](https://github.com/hls-fpga-machine-learning/hls4ml-tutorial). @@ -49,8 +51,8 @@ hls_model = hls4ml.converters.keras_to_hls(config) hls4ml.utils.fetch_example_list() ``` -### Building a project with Xilinx Vivado HLS (after downloading and installing from [here](https://www.xilinx.com/products/design-tools/vivado/integration/esl-design.html)) -Note: Vitis HLS is not yet supported. Vivado HLS versions between 2018.2 and 2020.1 are recommended. +### Building a project
+We will build the project using Xilinx Vivado HLS, which can be downloaded and installed from [here](https://www.xilinx.com/products/design-tools/vivado/integration/esl-design.html). Alongside Vivado HLS, hls4ml also supports Vitis HLS, Intel HLS, Catapult HLS and has some experimental support for Intel oneAPI. The target backend can be changed using the `backend` argument when building the model. ```Python # Use Vivado HLS to synthesize the model # This might take several minutes hls_model.build() # Print out the report if you want hls4ml.report.read_vivado_report('my-hls-test') ``` +# FAQ + +A list of frequently asked questions and common HLS synthesis issues can be found [here](https://fastmachinelearning.org/hls4ml/faq.html). + # Citation If you use this software in a publication, please cite the software ```bibtex @software{fastml_hls4ml, author = {{FastML Team}}, title = {fastmachinelearning/hls4ml}, - year = 2023, + year = 2024, publisher = {Zenodo}, - version = {v0.8.1}, + version = {v1.0.0}, doi = {10.5281/zenodo.1201549}, url = {https://github.com/fastmachinelearning/hls4ml} } diff --git a/docs/advanced/auto.rst b/docs/advanced/auto.rst new file mode 100644 index 0000000000..f944a11e54 --- /dev/null +++ b/docs/advanced/auto.rst @@ -0,0 +1,22 @@ +============================= +Automatic precision inference +============================= + +The automatic precision inference (implemented in :py:class:`~hls4ml.model.optimizer.passes.infer_precision.InferPrecisionTypes`) attempts to infer the appropriate +widths for a given precision. It is initiated by setting a precision in the configuration as ``'auto'``. (Note, only layer-level precisions can be set to ``'auto'``, +not model-level.) Functions like :py:class:`~hls4ml.utils.config.config_from_keras_model`, :py:class:`~hls4ml.utils.config.config_from_onnx_model`, +and :py:class:`~hls4ml.utils.config.config_from_pytorch_model` automatically set most precisions to ``'auto'`` if the ``'name'`` granularity is used. + +.. note:: + It is recommended to pass the backend to the ``config_from_*`` functions so that they can properly extract all the configurable precisions. + +The approach taken by the precision inference is to set the accumulator (the internal variable used to accumulate values in the matrix multiplications) and other precisions +to never truncate, using only the bitwidths of the inputs (not the values). This is quite conservative, especially in cases where post-training quantization is used, or +if the bit widths were set fairly loosely. The recommended action in that case is to edit the configuration and explicitly set some widths in it, potentially in an iterative process +after profiling the data. Another option is to pass a maximum precision using the ``max_precision`` parameter of the ``config_from_*`` functions. Then the automatic precision +inference will never set a bitwidth larger than the bitwidth of the ``max_precision`` or an integer part larger than the integer part of the ``max_precision`` that is passed. +(The bitwidth and integer parts of the ``max_precision`` are treated separately.) + +When manually setting bitwidths, the accumulator can overflow, and the precision may need to be reduced. For the accumulator, it is usually a bad idea to explicitly +enable rounding or saturation modes since it dramatically increases the execution time. For other types (e.g. output types or weight types), however, rounding and saturation handling +can be enabled as needed.
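+
+A minimal sketch of how this is typically driven from Python is shown below. The layer name and the ``max_precision`` value are only illustrative; check :py:class:`~hls4ml.utils.config.config_from_keras_model` for the exact keyword arguments.
+
+.. code-block:: python
+
+    import hls4ml
+
+    # 'name' granularity sets most precisions to 'auto'; passing the backend lets the
+    # function expose all backend-specific precisions, and max_precision caps what the
+    # inference pass may choose.
+    config = hls4ml.utils.config_from_keras_model(
+        model, granularity='name', backend='Vitis', max_precision='fixed<24,12>'
+    )
+
+    # Precisions left as 'auto' are inferred during conversion; explicitly set widths
+    # (hypothetical layer name 'dense1') are kept as given.
+    config['LayerName']['dense1']['Precision']['weight'] = 'fixed<8,3>'
+
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model, hls_config=config, backend='Vitis', output_dir='my-hls-test'
+    )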
diff --git a/docs/advanced/bramfactor.rst b/docs/advanced/bramfactor.rst new file mode 100644 index 0000000000..37fe766060 --- /dev/null +++ b/docs/advanced/bramfactor.rst @@ -0,0 +1,42 @@ +================================== +Loading weights from external BRAM +================================== + +.. note:: + This feature is being evaluated for re-implementation. We welcome feedback from users on how to make the implementation more flexible. + +``hls4ml`` can optionally store weights in BRAMs external to the design. This is supported in the Vivado/Vitis and Catapult backends. It is the responsibility of the user to ensure the weights are properly loaded during the operation of the design. + +The feature works as a threshold, exposed through a ``BramFactor`` config parameter. Layers with more weights than the threshold will be exposed through a BRAM interface. Consider the following code: + +.. code-block:: Python + + model = tf.keras.models.Sequential() + model.add(Dense(10, activation="relu", input_shape=(12,), name="dense_1")) + model.add(Dense(20, activation="relu", name="dense_2")) + model.add(Dense(5, activation="softmax", name="dense_3")) + model.compile(optimizer='adam', loss='mse') + + config = hls4ml.utils.config_from_keras_model(model) + config["Model"]["Strategy"] = "Resource" + config["Model"]["BramFactor"] = 100 + + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend + ) + +Having set ``BramFactor=100``, only layers with more than 100 weights will be exposed as external BRAM, in this case layers ``dense_1`` and ``dense_2``. ``BramFactor`` can currently only be set at the model level. The generated code will now have weights as part of the interface. + +.. code-block:: C++ + + void myproject( + hls::stream<input_t> &dense_1_input, + hls::stream<result_t> &layer7_out, + model_default_t w2[120], + model_default_t w4[200] + ) { + #pragma HLS INTERFACE axis port=dense_1_input,layer7_out + #pragma HLS INTERFACE bram port=w2,w4 + ... + +When integrating the design, users can use the exposed interface to implement a weight reloading scheme. diff --git a/docs/advanced/hgq.rst b/docs/advanced/hgq.rst new file mode 100644 index 0000000000..dd0faad7dc --- /dev/null +++ b/docs/advanced/hgq.rst @@ -0,0 +1,49 @@ +=================================== +High Granularity Quantization (HGQ) +=================================== + +.. image:: https://github.com/calad0i/HGQ/actions/workflows/sphinx-build.yml/badge.svg + :target: https://calad0i.github.io/HGQ/ +.. image:: https://badge.fury.io/py/hgq.svg + :target: https://badge.fury.io/py/hgq +.. image:: https://img.shields.io/badge/arXiv-2405.00645-b31b1b.svg + :target: https://arxiv.org/abs/2405.00645 + +`High Granularity Quantization (HGQ) `_ is a library that performs gradient-based automatic bitwidth optimization and quantization-aware training for neural networks to be deployed on FPGAs. By leveraging gradients, it allows for bitwidth optimization at arbitrary granularity, up to the per-weight and per-activation level. + +.. image:: https://calad0i.github.io/HGQ/_images/overview.svg + :alt: Overview of HGQ + :align: center + +Conversion of models made with the HGQ library is fully supported. The HGQ models are first converted to a proxy model format, which can then be parsed by hls4ml bit-accurately. Below is an example of how to create a model with HGQ and convert it to an hls4ml model. + +.. 
code-block:: Python + + import keras + from HGQ.layers import HDense, HDenseBatchNorm, HQuantize + from HGQ import ResetMinMax, FreeBOPs + + model = keras.models.Sequential([ + HQuantize(beta=1.e-5), + HDenseBatchNorm(32, beta=1.e-5, activation='relu'), + HDenseBatchNorm(32, beta=1.e-5, activation='relu'), + HDense(10, beta=1.e-5), + ]) + + opt = keras.optimizers.Adam(learning_rate=0.001) + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + model.compile(optimizer=opt, loss=loss, metrics=['accuracy']) + callbacks = [ResetMinMax(), FreeBOPs()] + + model.fit(..., callbacks=callbacks) + + from HGQ import trace_minmax, to_proxy_model + from hls4ml.converters import convert_from_keras_model + + trace_minmax(model, x_train, cover_factor=1.0) + proxy = to_proxy_model(model, aggressive=True) + + model_hls = convert_from_keras_model(proxy, backend='vivado',output_dir=... ,part=...) + + +An interactive example of HGQ can be found in the `kaggle notebook `_. Full documentation can be found at `calad0i.github.io/HGQ `_. diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst index a75224b8cc..302d646023 100644 --- a/docs/advanced/model_optimization.rst +++ b/docs/advanced/model_optimization.rst @@ -13,11 +13,11 @@ The code block below showcases three use cases of the hls4ml Optimization API - from tensorflow.keras.optimizers import Adam from tensorflow.keras.metrics import CategoricalAccuracy from tensorflow.keras.losses import CategoricalCrossentropy - from hls4ml.optimization.keras import optimize_model - from hls4ml.optimization.keras.utils import get_model_sparsity - from hls4ml.optimization.attributes import get_attributes_from_keras_model - from hls4ml.optimization.objectives import ParameterEstimator - from hls4ml.optimization.scheduler import PolynomialScheduler + from hls4ml.optimization.dsp_aware_pruning.keras import optimize_model + from hls4ml.optimization.dsp_aware_pruning.keras.utils import get_model_sparsity + from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model + from hls4ml.optimization.dsp_aware_pruning.objectives import ParameterEstimator + from hls4ml.optimization.dsp_aware_pruning.scheduler import PolynomialScheduler # Define baseline model and load data # X_train, y_train = ... # X_val, y_val = ... @@ -75,7 +75,7 @@ To optimize GPU FLOPs, the code is similar to above: .. code-block:: Python - from hls4ml.optimization.objectives.gpu_objectives import GPUFLOPEstimator + from hls4ml.optimization.dsp_aware_pruning.objectives.gpu_objectives import GPUFLOPEstimator # Optimize model # Note the change from ParameterEstimator to GPUFLOPEstimator @@ -98,7 +98,7 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config: .. 
code-block:: Python from hls4ml.utils.config import config_from_keras_model - from hls4ml.optimization.objectives.vivado_objectives import VivadoDSPEstimator + from hls4ml.optimization.dsp_aware_pruning.objectives.vivado_objectives import VivadoDSPEstimator # Note the change from optimize_model to optimize_keras_model_for_hls4ml # The function optimize_keras_model_for_hls4ml acts as a wrapper for the function, parsing hls4ml config to model attributes @@ -124,11 +124,11 @@ Finally, optimizing Vivado DSPs is possible, given a hls4ml config: acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1)) print(f'Optimized Keras accuracy: {acc_optimized}') -There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilisation and VivadoMultiObjectiveEstimator, aimed at optimising BRAM and DSP utilisation. -Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesing HLS, by modifying the config: +There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilization and VivadoMultiObjectiveEstimator, aimed at optimizing BRAM and DSP utilization. +Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesizing HLS, by modifying the config: .. code-block:: Python hls_config = config_from_keras_model(optimized_model) - hls_config['Model']['DenseResourceImplementation'] = 'Unrolled' - # Any addition hls4ml config, such as strategy, reuse factor etc... + hls_config['Model']['Strategy'] = 'Unrolled' + # Any additional hls4ml config, reuse factor etc... diff --git a/docs/api/profiling.rst b/docs/advanced/profiling.rst similarity index 100% rename from docs/api/profiling.rst rename to docs/advanced/profiling.rst diff --git a/docs/command.rst b/docs/api/command.rst similarity index 97% rename from docs/command.rst rename to docs/api/command.rst index cb9d346e31..1f821b7f35 100644 --- a/docs/command.rst +++ b/docs/api/command.rst @@ -50,7 +50,7 @@ hls4ml config hls4ml config [-h] [-m MODEL] [-w WEIGHTS] [-o OUTPUT] -This creates a conversion configuration file. Visit Configuration section of the :doc:`Setup ` page for more details on how to write a configuration file. +This creates a conversion configuration file. Visit the Configuration section of the :doc:`Setup <../intro/setup>` page for more details on how to write a configuration file. **Arguments** diff --git a/docs/api/concepts.rst b/docs/api/concepts.rst new file mode 100644 index 0000000000..9087470cf3 --- /dev/null +++ b/docs/api/concepts.rst @@ -0,0 +1,78 @@ +======== +Concepts +======== + +How it Works +---------------------- + +.. image:: ../img/nn_map_paper_fig_2.png + :width: 70% + :align: center + + +Consider a multilayer neural network. At each neuron in a layer :math:`m` (containing :math:`N_m` neurons), we calculate an output value (part of the output vector :math:`\mathbf{x}_m` of said layer) using the sum of output values of the previous layer multiplied by independent weights for each of these values and a bias value. An activation function is performed on the result to get the final output value for the neuron. Representing the weights as an :math:`N_m` by :math:`N_{m-1}` matrix :math:`W_{m,m-1}`, the bias values as :math:`\mathbf{b}_m`, and the activation function as :math:`g_m`, we can express this compactly as: + + +.. 
math:: + + \mathbf{x}_m = g_m (W_{m,m-1} \mathbf{x}_{m-1} +\mathbf{b}_m) + +With hls4ml, each layer of output values is calculated independently in sequence, using pipelining to speed up the process by accepting new inputs after an initiation interval. +The activations, if nontrivial, are precomputed. + +To ensure optimal performance, the user can control aspects of their model, principally: + + +* **Size/Compression** - Though not explicitly part of the ``hls4ml`` package, this is an important optimization to efficiently use the FPGA resources +* **Precision** - Define the :doc:`precision <../advanced/profiling>` of the calculations in your model +* **Dataflow/Resource Reuse** - Control parallel or streaming model implementations with varying levels of pipelining +* **Quantization Aware Training** - Achieve best performance at low precision with tools like QKeras, and benefit automatically during inference with ``hls4ml`` parsing of QKeras models + + +.. image:: ../img/reuse_factor_paper_fig_8.png + :width: 70% + :align: center + + +Often, these decisions will be hardware dependent to maximize performance. +Of note is that simplifying the input network must be done before using ``hls4ml`` to generate HLS code, for optimal compression to provide a sizable speedup. +Also important to note is the use of fixed point arithmetic in ``hls4ml``. +This improves processing speed relative to floating point implementations. +The ``hls4ml`` package also offers the functionality of configuring binning and output bit width of the precomputed activation functions as necessary. With respect to parallelization and resource reuse, ``hls4ml`` offers a "reuse factor" parameter that determines the number of times each multiplier is used in order to compute a layer of neurons' values. Therefore, a reuse factor of one would split the computation so each multiplier had to only perform one multiplication in the computation of the output values of a layer, as shown above. Conversely, a reuse factor of four, in this case, uses a single multiplier four times sequentially. A low reuse factor achieves the lowest latency and highest throughput but uses the most resources, while a high reuse factor saves resources at the expense of longer latency and lower throughput. + + +Frontends and Backends +---------------------- + +``hls4ml`` has a concept of a **frontend** that parses the input NN into an internal model graph, and a **backend** that controls +what type of output is produced from the graph. Frontends and backends can be independently chosen. Examples of frontends are the +parsers for Keras or ONNX, and examples of backends are Vivado HLS, Intel HLS, and Vitis HLS. See :ref:`Status and Features` for the +currently supported frontends and backends, or the dedicated sections for each frontend/backend. + + +I/O Types +--------- + +``hls4ml`` supports multiple styles for handling data transfer to/from the network and between layers, known as the ``io_type``. + +io_parallel +^^^^^^^^^^^ +In this processing style, data is passed in parallel between the layers. Conceptually this corresponds to a C/C++ array where all elements can be accessed at any time. This style allows for maximum parallelism and is well suited for MLP networks and small CNNs that aim for the lowest latency. Due to the impact of parallel processing on resource utilization on FPGAs, the synthesis may fail for larger networks. + +io_stream +^^^^^^^^^ +As opposed to the parallel processing style, in ``io_stream`` mode data is passed one "pixel" at a time. 
Each pixel is an array of channels, which are always sent in parallel. This method for sending data between layers is recommended for larger CNN and RNN networks. For one-dimensional ``Dense`` layers, all the inputs are streamed in parallel as a single array. + +With the ``io_stream`` IO type, each layer is connected with the subsequent layer through first-in first-out (FIFO) buffers. +The implementation of the FIFO buffers contributes to the overall resource utilization of the design, impacting in particular the BRAM or LUT utilization. +Because neural networks generally can have complex architectures, it is hard to know a priori the correct depth of each FIFO buffer. +By default ``hls4ml`` chooses the most conservative possible depth for each FIFO buffer, which can result in an unnecessary overutilization of resources. + +In order to reduce the impact on the resources used for FIFO buffer implementation, we have a FIFO depth optimization flow. This is described +in the :ref:`FIFO Buffer Depth Optimization` section. + + +Strategy +--------- + +**Strategy** in ``hls4ml`` refers to the implementation of the core matrix-vector multiplication routine, which can be latency-oriented, resource-saving oriented, or specialized. Different strategies will have an impact on the overall latency and resource consumption of each layer, and users are advised to choose based on their design goals. The availability of a particular strategy for a layer varies across backends; see the :doc:`Attributes <../ir/attributes>` section for a complete list of available strategies per-layer and per-backend. diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst index 72d677d196..1bc8f0676c 100644 --- a/docs/api/configuration.rst +++ b/docs/api/configuration.rst @@ -34,20 +34,46 @@ Using hls4ml, you can quickly generate a simple configuration dictionary from a import hls4ml config = hls4ml.utils.config_from_keras_model(model, granularity='model') -This python dictionary can be edited as needed. A more advanced configuration can be generated by, for example: +This Python dictionary can be edited as needed. A more advanced configuration can be generated, for example, for ONNX models: .. code-block:: python import hls4ml - config = hls4ml.utils.config_from_keras_model( + config = hls4ml.utils.config_from_onnx_model( model, granularity='name', default_precision='fixed<16,6>', backend='Vitis') -This will include per-layer configuration based on the model. Including the backend is recommended because some configation options depend on the backend. Note, the precisions at the -higher granularites usually default to 'auto', which means that ``hls4ml`` will try to set it automatically. Note that higher granularity settings take precendence -over model-level settings. See :py:class:`~hls4ml.utils.config.config_from_keras_model` for more information on the various options. +for Keras models: + +.. code-block:: python + + import hls4ml + config = hls4ml.utils.config_from_keras_model( + model, + granularity='name', + default_precision='fixed<16,6>', + backend='oneAPI') + +or for PyTorch models: + +.. code-block:: python + + import hls4ml + config = hls4ml.utils.config_from_pytorch_model( + model, + granularity='name', + default_precision='fixed<16,6>', + backend='Catapult') + + +The ``name`` granularity includes per-layer configuration based on the model. A ``'name'`` granularity is generally recommended because it allows for more tuning, and also because it allows +for automatic setting of precisions. 
The layer-level precisions with the ``'name'`` granularity default to ``'auto'``, which means that hls4ml will try to set them automatically +(see :ref:`Automatic precision inference`). Note that layer-level settings take precedence over model-level settings. A ``'name'`` granularity is required for QKeras +and QONNX model parsing. Passing the backend to these functions is recommended because some configuration options depend on the backend. See :py:class:`~hls4ml.utils.config.config_from_keras_model` +and similar for more information on the various options. Note specifically the documentation of :py:class:`~hls4ml.utils.config.config_from_pytorch_model` on how to handle differences in input data +formats between PyTorch and Keras (hls4ml follows Keras conventions internally). One can override specific values before using the configuration: @@ -59,7 +85,7 @@ Or to set the precision of a specific layer's weight: .. code-block:: python - config['LayerName']['fc1']['Precision']['weight'] = 'ap_fixed<8,4>' + config['LayerName']['fc1']['Precision']['weight'] = 'fixed<8,4>' To better understand how the configuration hierarchy works, refer to the next section for more details. @@ -75,7 +101,7 @@ Finally, one then uses the configuration to create an hls model: backend='Vitis' ) -See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. +See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. Similar functions exist for ONNX and PyTorch. ---- @@ -85,7 +111,7 @@ See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information 2.1 Top Level Configuration --------------------------- -Configuration files are YAML files in hls4ml (\ ``*.yml``\ ). An example configuration file is `here `__. +One can also use YAML configuration files in hls4ml (\ ``*.yml``\ ). An example configuration file is `here `__. It looks like this: @@ -108,7 +134,7 @@ It looks like this: HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 1 Strategy: Latency LayerType: @@ -124,7 +150,7 @@ There are a number of configuration options that you have. Let's go through the * **ProjectName**\ : the name of the HLS project IP that is produced * **KerasJson/KerasH5**\ : for Keras, the model architecture and weights are stored in a ``json`` and ``h5`` file. The path to those files are required here. We also support keras model's file obtained just from ``model.save()``. In this case you can just supply the ``h5`` file in ``KerasH5:`` field. -* **InputData/OutputPredictions**\ : path to your input/predictions of the model. If none is supplied, then hls4ml will create aritificial data for simulation. The data used above in the example can be found `here `__. We also support ``npy`` data files. We welcome suggestions on more input data types to support. +* **InputData/OutputPredictions**\ : path to your input/predictions of the model. If none is supplied, then hls4ml will create artificial data for simulation. The data used above in the example can be found `here `__. We also support ``npy`` data files. We welcome suggestions on more input data types to support. The backend-specific section of the configuration depends on the backend. You can get a starting point for the necessary settings using, for example `hls4ml.templates.get_backend('Vivado').create_initial_config()`. 
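+
+For example, a quick way to inspect those backend-specific defaults before editing them (a sketch; the exact keys returned depend on the backend):
+
+.. code-block:: python
+
+    import hls4ml
+
+    # Returns a dictionary of backend-specific settings (part, clock period, IO type, ...)
+    backend_config = hls4ml.templates.get_backend('Vivado').create_initial_config()
+    print(backend_config)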
For Vivado backend the options are: @@ -134,10 +160,13 @@ For Vivado backend the options are: Then you have some optimization parameters for how your algorithm runs: * **IOType**\ : your options are ``io_parallel`` or ``io_stream`` which defines the type of data structure used for inputs, intermediate activations between layers, and outputs. For ``io_parallel``, arrays are used that, in principle, can be fully unrolled and are typically implemented in RAMs. For ``io_stream``, HLS streams are used, which are a more efficient/scalable mechanism to represent data that are produced and consumed in a sequential manner. Typically, HLS streams are implemented with FIFOs instead of RAMs. For more information see `here `__. * **HLSConfig**\: the detailed configuration of precision and parallelism, including: + * **ReuseFactor**\ : in the case that you are pipelining, this defines the pipeline interval or initiation interval - * **Strategy**\ : Optimization strategy on FPGA, either "Latency" or "Resource". If none is supplied then hl4ml uses "Latency" as default. Note that a reuse factor larger than 1 should be specified when using "resource" strategy. An example of using larger reuse factor can be found `here. `__ - * **Precision**\ : this defines the precsion of your inputs, outputs, weights and biases. It is denoted by ``ap_fixed``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. - Additionally, integers in fixed precision data type (\ ``ap_int``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. You have a chance to further configure this more finely with per-layer configuration described below. + * **ParallelizationFactor**\ : The number of output "pixels" to compute in parallel in convolutional layers. Increasing this parameter results in a significant increase in the resources required on the FPGA. + * **Strategy**\ : Optimization strategy on FPGA, either "Latency", "Resource" or "Unrolled". If none is supplied then hls4ml uses "Latency" as the default. Note that a reuse factor larger than 1 should be specified when using "resource" or "unrolled" strategy. An example of using a larger reuse factor can be found `here. `__ + * **PipelineStyle**\ : Set the top level pipeline style. Valid options are "auto", "pipeline" and "dataflow". If unspecified, it defaults to "auto". + * **PipelineInterval**\ : Optionally override the desired initiation interval of the design. Only valid in combination with "pipeline" style. If unspecified, it is left to the compiler to decide, ideally matching the largest reuse factor of the network. + * **Precision**\ : this defines the precision of your inputs, outputs, weights and biases. It is denoted by ``fixed<X,Y>``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. Additionally, integers in the type (\ ``int<N>``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. The format follows ``ap_fixed`` and ``ap_int`` conventions. You have a chance to further configure this more finely with per-layer configuration described below. In the per-layer configuration (but not globally) one can also use ``'auto'`` precision. 
2.2 Per-Layer Configuration --------------------------- @@ -150,10 +179,10 @@ Under the ``HLSConfig`` heading, these can be set for the ``Model``\ , per ``Lay HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 1 -This configuration use ``ap_fixed<16,6>`` for every variable and a ReuseFactor of 1 throughout. +This configuration uses ``fixed<16,6>`` for every variable and a ReuseFactor of 1 throughout. Specify all ``Dense`` layers to use a different precision like this: @@ -161,13 +190,13 @@ Specify all ``Dense`` layers to use a different precision like this: HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 1 LayerType: Dense: - Precision: ap_fixed<14,5> + Precision: fixed<14,5> -In this case, all variables in any ``Dense`` layers will be represented with ``ap_fixed<14,5>`` while any other layer types will use ``ap_fixed<16,6>``. +In this case, all variables in any ``Dense`` layers will be represented with ``fixed<14,5>`` while any other layer types will use ``fixed<16,6>``. A specific layer can be targeted like this: @@ -175,18 +204,18 @@ A specific layer can be targeted like this: HLSConfig: Model: - Precision: ap_fixed<16,6> + Precision: fixed<16,6> ReuseFactor: 16 LayerName: dense1: Precision: - weight: ap_fixed<14,2> - bias: ap_fixed<14,4> - result: ap_fixed<16,6> + weight: fixed<14,2> + bias: fixed<14,4> + result: fixed<16,6> ReuseFactor: 12 Strategy: Resource -In this case, the default model configuration will use ``ap_fixed<16,6>`` and a ``ReuseFactor`` of 16. The layer named ``dense1`` (defined in the user provided model architecture file) will instead use different precision for the ``weight``\ , ``bias``\ , and ``result`` (output) variables, a ``ReuseFactor`` of 12, and the ``Resource`` strategy (while the model default is ``Latency`` strategy. +In this case, the default model configuration will use ``fixed<16,6>`` and a ``ReuseFactor`` of 16. The layer named ``dense1`` (defined in the user provided model architecture file) will instead use different precision for the ``weight``\ , ``bias``\ , and ``result`` (output) variables, a ``ReuseFactor`` of 12, and the ``Resource`` strategy (while the model default is the ``Latency`` strategy). More than one layer can have a configuration specified, e.g.: @@ -203,7 +232,7 @@ More than one layer can have a configuration specified, e.g.: dense2: ... -For more information on the optimization parameters and what they mean, you can visit the :doc:`Concepts <../concepts>` chapter. +For more information on the optimization parameters and what they mean, you can visit the :doc:`Concepts <../api/concepts>` section. ---- @@ -232,7 +261,7 @@ In your project, the file ``/firmware/.cpp`` is your top nnet::sigmoid(layer4_out, layer5_out); -You can see, for the simple 1-layer DNN, the computation (\ ``nnet::dense_latency``\ ) and activation (\ ``nnet::relu``\ /\ ``nnet::sigmoid``\ ) caluclation for each layer. For each layer, it has its own additional configuration parameters, e.g. ``config2``. +You can see, for the simple 1-layer DNN, the computation (\ ``nnet::dense_latency``\ ) and activation (\ ``nnet::relu``\ /\ ``nnet::sigmoid``\ ) calculation for each layer. Each layer has its own additional configuration parameters, e.g. ``config2``. In your project, the file ``/firmware/parameters.h`` stores all the configuration options for each neural network library. An example is `here `__. 
So for example, the detailed configuration options for an example DNN layer is: diff --git a/docs/attr_doc_gen.py b/docs/attr_doc_gen.py new file mode 100644 index 0000000000..0ba2a5b77e --- /dev/null +++ b/docs/attr_doc_gen.py @@ -0,0 +1,149 @@ +import numbers + +import hls4ml.backends as backends +import hls4ml.model.attributes as attributes +import hls4ml.model.layers as layers + + +class AttrList: + def __init__(self, cls_name, cls_attrs) -> None: + self.cls_name = cls_name + self.config_attrs = [attr for attr in cls_attrs if attr.configurable is True] + self.type_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'TypeAttribute'] + self.weight_attrs = [attr for attr in cls_attrs if attr.__class__.__name__ == 'WeightAttribute'] + self.base_attrs = [attr for attr in cls_attrs if attr not in self.config_attrs + self.type_attrs + self.weight_attrs] + self.backend_attrs = {} + self.reverse_backend_attrs = [] # Will hold (attr, backend_name) pairs, used temporarily + self.unique_backend_attrs = [] + + def add_backend_attrs(self, backend_name, backend_attrs): + self.backend_attrs[backend_name] = backend_attrs + + for attr in backend_attrs: + self.reverse_backend_attrs.append((attr, backend_name)) + + def sift_backend_attrs(self): + grouped_dict = {} + for attr, backend_name in self.reverse_backend_attrs: + if attr not in grouped_dict: + grouped_dict[attr] = [] + grouped_dict[attr].append(backend_name) + + for attr, backend_names in grouped_dict.items(): + attr.available_in = backend_names + self.unique_backend_attrs.append(attr) + + @property + def only_configurable(self): + all_attrs = self.config_attrs + self.type_attrs + self.unique_backend_attrs + return [attr for attr in all_attrs if attr.configurable is True] + + +def convert_to_attr_list(): + all_backends = backends.get_available_backends() + # Removing duplicates but preserving order + all_layers = list(dict.fromkeys(layers.layer_map.values())) + all_layers_attrs = [] + + for layer_cls in all_layers: + base_attrs = layer_cls.expected_attributes + + attr_list = AttrList(layer_cls.__name__, base_attrs) + + for backend_name in all_backends: + backend = backends.get_backend(backend_name) + + backend_cls = backend.create_layer_class(layer_cls) + backend_attrs = backend_cls.expected_attributes + + diff_atts = [ + attr for attr in backend_attrs if attr not in base_attrs + ] # Sets are faster, but don't preserve order + if len(diff_atts) > 0: + attr_list.add_backend_attrs(backend.name, diff_atts) + + all_layers_attrs.append(attr_list) + + for attr_list in all_layers_attrs: + attr_list.sift_backend_attrs() + + return all_layers_attrs + + +def print_attrs(attrs, file): + for attr in attrs: + if attr.value_type == numbers.Integral: + vtype = 'int' + elif attr.__class__ == attributes.ChoiceAttribute: + choices = ','.join([str(c) for c in attr.choices]) + vtype = f'list [{choices}]' + else: + vtype = attr.value_type.__name__ if hasattr(attr.value_type, '__name__') else str(attr.value_type) + + if attr.default is None: + file.write('* ' + attr.name + ': ' + vtype + '\n\n') + else: + file.write('* ' + attr.name + ': ' + vtype + ' (Default: ' + str(attr.default) + ')\n\n') + + if attr.description is not None: + file.write(' * ' + attr.description + '\n\n') + + if hasattr(attr, 'available_in'): + file.write(' * Available in: ' + ', '.join(attr.available_in) + '\n\n') + + +def write_all_attributes(all_layers_attrs): + with open('attributes.rst', mode='w') as file: + file.write('================\n') + file.write('Layer attributes\n') + 
file.write('================\n\n\n') + + for attr_list in all_layers_attrs: + file.write(attr_list.cls_name + '\n') + file.write('=' * len(attr_list.cls_name) + '\n') + + if len(attr_list.base_attrs) > 0: + file.write('Base attributes\n') + file.write('---------------\n') + print_attrs(attr_list.base_attrs, file) + + if len(attr_list.type_attrs) > 0: + file.write('Type attributes\n') + file.write('---------------\n') + print_attrs(attr_list.type_attrs, file) + + if len(attr_list.weight_attrs) > 0: + file.write('Weight attributes\n') + file.write('-----------------\n') + print_attrs(attr_list.weight_attrs, file) + + if len(attr_list.config_attrs) > 0: + file.write('Configurable attributes\n') + file.write('-----------------------\n') + print_attrs(attr_list.config_attrs, file) + + if len(attr_list.backend_attrs) > 0: + file.write('Backend-specific attributes\n') + file.write('---------------------------\n') + print_attrs(attr_list.unique_backend_attrs, file) + + +def write_only_configurable(all_layers_attrs): + with open('attributes.rst', mode='w') as file: + file.write('================\n') + file.write('Layer attributes\n') + file.write('================\n\n\n') + + for attr_list in all_layers_attrs: + file.write(attr_list.cls_name + '\n') + file.write('=' * len(attr_list.cls_name) + '\n') + + config_attrs = attr_list.only_configurable + if len(config_attrs) > 0: + print_attrs(config_attrs, file) + + +if __name__ == '__main__': + all_layers_attrs = convert_to_attr_list() + write_all_attributes(all_layers_attrs) + # write_only_configurable(all_layers_attrs) diff --git a/docs/advanced/accelerator.rst b/docs/backend/accelerator.rst similarity index 95% rename from docs/advanced/accelerator.rst rename to docs/backend/accelerator.rst index 7a79d9dbdc..187bccaa2c 100644 --- a/docs/advanced/accelerator.rst +++ b/docs/backend/accelerator.rst @@ -1,8 +1,8 @@ -========================= -VivadoAccelerator Backend -========================= +================= +VivadoAccelerator +================= -The ``VivadoAccelerator`` backend of ``hls4ml`` leverages the `PYNQ `_ software stack to easily deploy models on supported devices. +The **VivadoAccelerator** backend of ``hls4ml`` leverages the `PYNQ `_ software stack to easily deploy models on supported devices. Currently ``hls4ml`` supports the following boards: * `pynq-z2 `_ (part: ``xc7z020clg400-1``) @@ -13,7 +13,7 @@ Currently ``hls4ml`` supports the following boards: * `alveo-u280 `_ (part: ``xcu280-fsvh2892-2L-e``) but, in principle, support can be extended to `any board supported by PYNQ `_. -For the Zynq-based boards, there are two components: an ARM-based processing system (PS) and FPGA-based programmable logic (PL), with various intefaces between the two. +For the Zynq-based boards, there are two components: an ARM-based processing system (PS) and FPGA-based programmable logic (PL), with various interfaces between the two. .. image:: ../img/zynq_interfaces.png :height: 300px diff --git a/docs/backend/catapult.rst b/docs/backend/catapult.rst new file mode 100644 index 0000000000..00cf0fb98b --- /dev/null +++ b/docs/backend/catapult.rst @@ -0,0 +1,7 @@ +======== +Catapult +======== + +Support for the Siemens Catapult HLS compiler has been added in ``hls4ml`` version 1.0.0. 
+ +*TODO expand this section* diff --git a/docs/backend/oneapi.rst b/docs/backend/oneapi.rst new file mode 100644 index 0000000000..585bfc27cb --- /dev/null +++ b/docs/backend/oneapi.rst @@ -0,0 +1,35 @@ +====== +oneAPI +====== + +The **oneAPI** backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It will eventually +replace the **Quartus** backend, which targeted Intel HLS. (Quartus continues to be used with IP produced by the +**oneAPI** backend.) This section discusses details of the **oneAPI** backend. + +The **oneAPI** code uses SYCL kernels to implement the logic that is deployed on FPGAs. It naturally leads to the +accelerator style of programming. In the SYCL HLS (IP Component) flow, which is currently the only flow supported, the +kernel becomes the IP, and the "host code" becomes the testbench. An accelerator flow, with easier deployment on +PCIe accelerator boards, is planned to be added in the future. + +The produced work areas use cmake to build the projects in a style based on the +`oneAPI-samples `_. +The standard ``fpga_emu``, ``report``, ``fpga_sim``, and ``fpga`` make targets are supported. Additionally, ``make lib`` +produces the library used for calling the ``predict`` function from hls4ml. The ``compile`` and ``build`` commands +in hls4ml interact with the cmake system, so one does not need to manually use the build system, but it is there +if desired. + +The **oneAPI** backend, like the **Quartus** backend, only implements the ``Resource`` strategy for the layers. There +is no ``Latency`` implementation of any of the layers. + +Note: currently tracing and external weights (i.e. setting BramFactor) are not supported. + +io_parallel and io_stream +========================= + +As mentioned in the :ref:`I/O Types` section, ``io_parallel`` is for small models, while ``io_stream`` is for +larger models. In ``oneAPI``, there is an additional difference: ``io_stream`` implements each layer on its +own ``task_sequence``. Thus, the layers run in parallel, with pipes connecting the inputs and outputs. This +is similar in style to the `dataflow` implementation on Vitis HLS, but more explicit. It is also a change +relative to the Intel HLS-based ``Quartus`` backend. On the other hand, ``io_parallel`` always uses a single task, +relying on pipelining within the task for good performance. In contrast, the Vitis backend sometimes uses dataflow +with ``io_parallel``. diff --git a/docs/backend/quartus.rst b/docs/backend/quartus.rst new file mode 100644 index 0000000000..8cde5f97b2 --- /dev/null +++ b/docs/backend/quartus.rst @@ -0,0 +1,12 @@ +======= +Quartus +======= + +.. warning:: + The **Quartus** backend is deprecated and will be removed in a future version. Users should migrate to the **oneAPI** backend. + +The **Quartus** backend of hls4ml is designed for deploying NNs on Intel/Altera FPGAs. It uses the discontinued Intel HLS compiler. The **oneAPI** backend should be preferred for new projects. +The **oneAPI** backend contains the HLS code migrated from this backend, with significantly better io_stream support, though the **oneAPI** backend does not yet support profiling, tracing, +or the BramFactor option supported by the **Quartus** backend. Nevertheless, little or no further development is expected for this backend. + +The **Quartus** backend only implements the ``Resource`` strategy for the layers. There is no ``Latency`` implementation of any of the layers. 
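+
+As a starting point for migrating, a conversion targeting the **oneAPI** backend instead could look like the following sketch (the model and output directory are placeholders):
+
+.. code-block:: python
+
+    import hls4ml
+
+    config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='oneAPI')
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model, hls_config=config, backend='oneAPI', io_type='io_stream', output_dir='my-oneapi-prj'
+    )
+    hls_model.compile()  # builds the emulation library used by predict()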
diff --git a/docs/backend/sr.rst b/docs/backend/sr.rst new file mode 100644 index 0000000000..93a247b63d --- /dev/null +++ b/docs/backend/sr.rst @@ -0,0 +1,7 @@ +================== +SymbolicExpression +================== + +This backend can be used to implement expressions obtained through symbolic regression tools such as `PySR `_ or `SymbolNet `_. The backend targets Vivado/Vitis HLS and relies on HLS math libraries provided with a licensed installation of these tools. + +*TODO expand this section* diff --git a/docs/backend/vitis.rst b/docs/backend/vitis.rst new file mode 100644 index 0000000000..9528e89a93 --- /dev/null +++ b/docs/backend/vitis.rst @@ -0,0 +1,11 @@ +============ +Vivado/Vitis +============ + +The **Vivado** and **Vitis** backends are aimed for use with AMD/Xilinx FPGAs. The **Vivado** backend targets the discontinued ``Vivado HLS`` compiler, while +the **Vitis** backend targets the ``Vitis HLS`` compiler. Both are designed to produce IP for incorporation in ``Vivado`` designs. (See :doc:`VivadoAccelerator ` +for generating easily-deployable models with ``Vivado HLS``.) The ``Vitis`` accelerator flow is not directly supported, though HLS produced with the **Vitis** +backend can be easily incorporated into Vitis kernel. + +Users should generally use the **Vitis** backend for new designs that target AMD/Xilinx FPGAs; new ``hls4ml`` developments will not necessarily be backported to +the **Vivado** backend. diff --git a/docs/concepts.rst b/docs/concepts.rst deleted file mode 100644 index b788d5ba5d..0000000000 --- a/docs/concepts.rst +++ /dev/null @@ -1,69 +0,0 @@ -======== -Concepts -======== - -The goal of ``hls4ml`` is to provide an efficient and fast translation of machine learning models from open-source packages (like Keras and PyTorch) for training machine learning algorithms to high level synthesis (HLS) code that can then be transpiled to run on an FPGA. The resulting HLS project can be then used to produce an IP which can be plugged into more complex designs or be used to create a kernel for CPU co-processing. The user has freedom to define many of the parameters of their algorithm to best suit their needs. - -The ``hls4ml`` package enables fast prototyping of a machine learning algorithm implementation in FPGAs, -greatly reducing the time to results and giving the user intuition for how to best design a machine learning algorithm for their application while balancing performance, resource utilization and latency requirements. - -The Inspiration -=============== - -The inspiration for the creation of the ``hls4ml`` package stems from the high energy physics community at the CERN Large Hadron Collider (LHC). -While machine learning has already been proven to be extremely useful in analysis of data from detectors at the LHC, it is typically performed in an "offline" environment after the data is taken and agglomerated. -However, one of the largest problems at detectors on the LHC is that collisions, or "events", generate too much data for everything to be saved. -As such, filters called "triggers" are used to determine whether a given event should be kept. -Using FPGAs allows for significantly lower latency so machine learning algorithms can essentially be run "live" at the detector level for event selection. As a result, more events with potential signs of new physics can be preserved for analysis. - -The Solution: ``hls4ml`` -======================== - -.. 
image:: img/overview.jpg - - -With this in mind, let's take a look at how ``hls4ml`` helps to achieve such a goal. First, it's important to realize the architecture differences between an FPGA and a CPU or GPU. -An FPGA can be specifically programmed to do a certain task, in this case evaluate neural networks given a set of inputs, and as such can be highly optimized for the task, with tricks like pipelining and parallel evaluation. However, this means dynamic remapping while running isn't really a possibility. -FPGAs also often come at a comparatively low power cost with respect to CPUs and GPUs. This allows ``hls4ml`` to build HLS code from compressed neural networks that results in predictions on the microsecond scale for latency. -The ``hls4ml`` tool saves the time investment needed to convert a neural network to a hardware design language or even HLS code, thus allowing for rapid prototyping. - -How it Works -============= - -.. image:: img/nn_map_paper_fig_2.png - :width: 70% - :align: center - - -Consider a multilayer neural network. At each neuron in a layer :math:`m` (containing :math:`N_m` neurons), we calculate an output value (part of the output vector :math:`\mathbf{x}_m` of said layer) using the sum of output values of the previous layer multiplied by independent weights for each of these values and a bias value. An activation function is performed on the result to get the final output value for the neuron. Representing the weights as a :math:`N_m` by :math:`N_{m-1}` matrix :math:`W_{m,m-1}`, the bias values as :math:`\mathbf{b}_m`, and the activation function as :math:`g_m`, we can express this compactly as: - - -.. math:: - - \mathbf{x}_m = g_m (W_{m,m-1} \mathbf{x}_{m-1} +\mathbf{b}_m) - -With hls4ml, each layer of output values is calculated independently in sequence, using pipelining to speed up the process by accepting new inputs after an initiation interval. -The activations, if nontrivial, are precomputed. - -To ensure optimal performance, the user can control aspects of their model, principally: - - -* **Size/Compression** - Though not explicitly part of the ``hls4ml`` package, this is an important optimization to efficiently use the FPGA resources -* **Precision** - Define the :doc:`precision ` of the calculations in your model -* **Dataflow/Resource Reuse** - Control parallel or streaming model implementations with varying levels of pipelining -* **Quantization Aware Training** - Achieve best performance at low precision with tools like QKeras, and benefit automatically during inference with ``hls4ml`` parsing of QKeras models - - -.. image:: img/reuse_factor_paper_fig_8.png - :width: 70% - :align: center - - -Often, these decisions will be hardware dependent to maximize performance. -Of note is that simplifying the input network must be done before using ``hls4ml`` to generate HLS code, for optimal compression to provide a sizable speedup. -Also important to note is the use of fixed point arithmetic in ``hls4ml``. -This improves processing speed relative to floating point implementations. -The ``hls4ml`` package also offers the functionality of configuring binning and output bit width of the precomputed activation functions as necessary. With respect to parallelization and resource reuse, ``hls4ml`` offers a "reuse factor" parameter that determines the number of times each multiplier is used in order to compute a layer of neuron's values. 
Therefore, a reuse factor of one would split the computation so each multiplier had to only perform one multiplication in the computation of the output values of a layer, as shown above. Conversely, a reuse factor of four, in this case, uses a single multiplier four times sequentially. Low reuse factor achieves the lowest latency and highest throughput but uses the most resources, while high reuse factor save resources at the expense of longer latency and lower throughput. -The reuse factor can be set using the configuration options defined on the :doc:`Setup ` page. - -Thereby, the ``hls4ml`` package builds efficient HLS code to implement neural networks on FPGAs for microsecond-scale latency on predictions. For more detailed information, take a look at our :doc:`References ` page. All figures on this page are taken from the following paper: `JINST 13 P07027 (2018) `_. diff --git a/docs/details.rst b/docs/details.rst deleted file mode 100644 index 750833001d..0000000000 --- a/docs/details.rst +++ /dev/null @@ -1,33 +0,0 @@ -================ -Software Details -================ - -Frontends and Backends ----------------------- - -In ``hls4ml`` there is a a concept of a *frontend* to parse the input NN into an internal model graph, and a *backend* that controls -what type of output is produced from the graph. Frontends and backends can be independently chosen. Examples of frontends are the -parsers for Keras or ONNX, and examples of backends are Vivado HLS, Intel HLS, and Vitis HLS. See :ref:`Status and Features` for the -currently supported frontends and backends. - -I/O Types ---------- - -``hls4ml`` supports multiple styles for handling data between layers, known as the ``io_type``. - -io_parallel -^^^^^^^^^^^ -Data is passed in parallel between the layers. This is good for MLP networks and small CNNs. Synthesis may fail for larger networks. - -io_stream -^^^^^^^^^ -Data is passed one "pixel" at a time. Each pixel is an array of channels, which are always sent in parallel. This method for sending -data between layers is recommended for larger CNNs. For ``Dense`` layers, all the inputs are streamed in parallel as a single array. - -With the ``io_stream`` IO type, each layer is connected with the subsequent layer through first-in first-out (FIFO) buffers. -The implementation of the FIFO buffers contribute to the overall resource utilization of the design, impacting in particular the BRAM or LUT utilization. -Because the neural networks can have complex architectures generally, it is hard to know a priori the correct depth of each FIFO buffer. -By default ``hls4ml`` choses the most conservative possible depth for each FIFO buffer, which can result in a an unnecessary overutilization of resources. - -In order to reduce the impact on the resources used for FIFO buffer implementation, we have a FIFO depth optimization flow. This is described -in the :ref:`FIFO Buffer Depth Optimization` section. diff --git a/docs/frontend/keras.rst b/docs/frontend/keras.rst new file mode 100644 index 0000000000..d6d42cb4b8 --- /dev/null +++ b/docs/frontend/keras.rst @@ -0,0 +1,11 @@ +================ +Keras and QKeras +================ + +Keras and the quantization library QKeras are well supported in ``hls4ml``. Currently, Keras v2 (``tf.keras``) is the preferred version, and future versions of ``hls4ml`` will expand support for Keras v3. The frontend is based on parsing the serialized JSON representation of the model. 
+ +Currently, ``hls4ml`` can parse most Keras layers, including core layers, convolutional layers, pooling layers, recurrent layers, merging/reshaping layers and activation layers, implemented either via the sequential or functional API. Notably missing are the attention and normalization layers. The equivalent QKeras API and quantizers are also supported. The ``Lambda`` layers don't save their state in the serialized format and are thus impossible to parse. In this case, the ``Lambda`` layers can be implemented as custom layers and parsed via the :ref:`Extension API`. + +The ``data_format='channels_first'`` parameter of Keras layers is supported, but not extensively tested. All HLS implementations in ``hls4ml`` are based on the ``channels_last`` data format, and models need to be converted to that format before the HLS code can be emitted. We encourage users of ``channels_first`` to report their experiences to developers on GitHub. + +The development team of ``hls4ml`` is currently exploring options for a QKeras alternative and will provide a drop-in replacement API compatible with Keras v3. diff --git a/docs/frontend/pytorch.rst b/docs/frontend/pytorch.rst new file mode 100644 index 0000000000..6e91d0c44e --- /dev/null +++ b/docs/frontend/pytorch.rst @@ -0,0 +1,20 @@ +==================== +PyTorch and Brevitas +==================== + +The PyTorch frontend in ``hls4ml`` is implemented by parsing the symbolic trace of the ``torch.fx`` framework. This ensures the proper execution graph is captured. Therefore, only models that can be traced with the FX framework can be parsed by ``hls4ml``. + +Provided the underlying operation is supported in ``hls4ml``, we generally aim to support the use of both ``torch.nn`` classes and ``torch.nn.functional`` functions in the construction of PyTorch models. Generally, the use of classes is more thoroughly +tested. Please reach out if you experience any issues with either case. + +The PyTorch/Brevitas parser is under heavy development and doesn't yet have the same feature set as the Keras parsers. Feel free to reach out to developers if you find a missing feature that is present in the Keras parser and would like it implemented. + +.. note:: + The direct ingestion of models quantized with Brevitas is not supported currently. Instead, Brevitas models should be exported in the ONNX format (see `here `_) and read with the ``hls4ml`` + QONNX frontend. Issues may arise, for example when non power-of-2 or non-scalar quantization scales are used. Please reach out if you encounter any problems with this workflow. + +For multi-dimensional tensors, ``hls4ml`` follows the channels-last convention adopted by Keras, whereas PyTorch uses channels-first. By default, ``hls4ml`` will automatically transpose any tensors associated with weights and biases of the internal layers +of the model. If the ``io_parallel`` I/O type (see :ref:`Concepts`) is used, a transpose node will be added to the model that also adjusts the input tensors. This is not available in the ``io_stream`` case and inputs must be transposed by the user. +Outputs are not transposed back by default, but in the ``io_parallel`` case, a transpose node can be added. If not needed, these adjustments can also be switched off. See :py:class:`~hls4ml.utils.config.config_from_pytorch_model` for details. + +The equivalent of the Keras extension API is not yet available for the PyTorch parser, and will be provided in the future. 
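+
+As a concrete illustration of the channels-last convention discussed above, an input prepared for ``predict`` in the ``io_stream`` case could be transposed as follows (a sketch using NumPy; the shapes are illustrative):
+
+.. code-block:: python
+
+    import numpy as np
+
+    # PyTorch activations are channels-first: (batch, channels, height, width)
+    x_torch = np.random.rand(16, 3, 32, 32).astype(np.float32)
+
+    # hls4ml kernels expect channels-last: (batch, height, width, channels)
+    x_hls = np.ascontiguousarray(x_torch.transpose(0, 2, 3, 1))
+
+    # y_hls = hls_model.predict(x_hls)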
diff --git a/docs/frontend/qonnx.rst b/docs/frontend/qonnx.rst new file mode 100644 index 0000000000..09b0074a0b --- /dev/null +++ b/docs/frontend/qonnx.rst @@ -0,0 +1,56 @@ +============== +ONNX and QONNX +============== + +Parsing of ONNX and QONNX models is done in conjunction with the `qonnx `_ package, even if no quantization is used. This is a common initial parser shared with the AMD/Xilinx FINN project. The first step is to do constant folding, shape inference, etc., on the ONNX graph, commonly known as `cleaning`. If a model has convolution layers, the model also needs to be converted to a channels-last format, since that is what hls4ml mainly supports. The ``qonnx`` package also provides a number of additional transforms that may need to be used. For example, ``Gemm`` nodes need to be converted to ``MatMul`` and ``Add`` nodes. + +There are command-line based versions of cleaning and channels-last conversion: + +.. code-block:: bash + + $ qonnx_clean filename.onnx + $ qonnx_to_channels_last filename_clean.onnx + $ qonnx_clean filename_clean_channels_last.onnx # good to do a clean again as a last step + +Things can similarly be done in Python. This method is usually easier if you additionally need to call other transforms. An example is given below which also calls the ``GemmToMatMul`` converter: + +.. code-block:: python + + model = ModelWrapper('filename.onnx') + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(ConvertToChannelsLastAndClean()) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + +``ModelWrapper`` is defined in ``qonnx.core.modelwrapper``. More information on the ``qonnx`` package can be found at the `QONNX documentation page `_. + + +The next steps are very similar to those for a Keras model: + +.. code-block:: python + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend='Vitis', default_precision='fixed<16,6>' + ) + # modify the config as desired + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir='my-hls-test', + io_type='io_stream', + backend='Vitis', + hls_config=config, + ) + hls_model.compile() + +Note that, unlike the Keras version, "name" granularity is the default for ``config_from_onnx_model``, and it must be used for QONNX models. Unquantized ONNX models can use "model" if so desired, but generally there is no benefit. + +One can subsequently call the ``predict`` function to check the performance or build the project. + +Note that ``execute_onnx`` in ``qonnx.core.onnx_exec`` can be used to run the QONNX graphs directly, and it also provides the values at intermediate layers for validating the model (tracing). + +Quant nodes +=========== + +Documentation for quant nodes is provided in the `qonnx package `_. Note that currently hls4ml only supports the `Quant operator `_. Also, not all legal ``Quant`` configurations are parsable by hls4ml or synthesizable. The ``scale``, ``zeropt``, and ``bitwidth`` values must be constant (though not necessarily scalar for the ``scale`` and ``zeropt``). + +Generally, if the ``zeropt`` is 0 and the ``scale`` is a scalar power of 2, hls4ml uses ``ap_fixed`` or ``ac_fixed`` types (depending on the backend) to represent the quantizations. In other cases, the ``scale`` and ``zeropt`` need to be explicitly handled by hls4ml, and there is more of a chance of hls4ml not being able to process the input. (Please report any issues that you find.)
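+
+As a rough, self-contained illustration of that correspondence (this is a sketch of the idea, not the exact logic hls4ml applies internally), a ``Quant`` node with ``zeropt = 0`` and a power-of-two ``scale`` maps to a fixed-point type whose LSB weight equals the scale:
+
+.. code-block:: python
+
+    import math
+
+    def quant_to_fixed(bitwidth: int, scale: float, signed: bool = True) -> str:
+        """Illustrative mapping of a power-of-two Quant node to a fixed-point type string."""
+        frac_bits = -int(math.log2(scale))   # e.g. scale = 0.125 -> 3 fractional bits
+        int_bits = bitwidth - frac_bits      # remaining bits (including the sign bit) are integer bits
+        return f"{'fixed' if signed else 'ufixed'}<{bitwidth},{int_bits}>"
+
+    print(quant_to_fixed(8, 0.125))  # fixed<8,5>: 8 bits total, LSB weight 2**-3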
diff --git a/docs/index.rst b/docs/index.rst index c21b90aebc..ff92a3d543 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,30 +2,64 @@ :hidden: :caption: Introduction - concepts - status - setup - release_notes - details - flows - command - reference + intro/introduction + intro/status + intro/setup + intro/faq + intro/release_notes + intro/reference .. toctree:: :hidden: :glob: - :caption: Quick API Reference + :caption: User Guide - api/* + api/concepts + api/configuration + api/command + +.. toctree:: + :hidden: + :glob: + :caption: Frontends + + frontend/keras + frontend/pytorch + frontend/qonnx + +.. toctree:: + :hidden: + :glob: + :caption: Backends + + backend/vitis + backend/accelerator + backend/oneapi + backend/catapult + backend/quartus + backend/sr .. toctree:: :hidden: :caption: Advanced Features + advanced/profiling + advanced/auto + advanced/hgq advanced/fifo_depth advanced/extension - advanced/accelerator advanced/model_optimization + advanced/bramfactor + +.. toctree:: + :hidden: + :glob: + :caption: Internals + + ir/ir + ir/modelgraph + ir/flows + ir/attributes .. toctree:: :hidden: @@ -59,6 +93,4 @@ For the latest status including current and planned features, see the :ref:`Stat Tutorials ================================= -Detailed tutorials on how to use ``hls4ml``'s various functionalities can be found at: - -https://github.com/fastmachinelearning/hls4ml-tutorial +Detailed tutorials on how to use ``hls4ml``'s various functionalities can be found `here `_. diff --git a/docs/intro/faq.rst b/docs/intro/faq.rst new file mode 100644 index 0000000000..22b4c6c99a --- /dev/null +++ b/docs/intro/faq.rst @@ -0,0 +1,52 @@ +Frequently asked questions +========================== + +**What is hls4ml?** + +``hls4ml`` is a tool for converting neural network models into FPGA firmware. hls4ml is aimed at low-latency applications, such as triggering at the Large Hadron Collider (LHC) at CERN, but is applicable to other domains requiring microsecond latency. See the full documentation for more details. + +**How does hls4ml work?** + +``hls4ml`` takes the models from Keras, PyTorch and ONNX (optionally quantized with the respective quantization libraries) and produces high-level synthesis code (based on C++) that can be converted to FPGA firmware using the HLS compilers from different vendors (AMD/Xilinx, Intel/Altera, Catapult...). + +**How is hls4ml so fast?** + +``hls4ml`` stores all weights on-chip for fast access and has tuneable parallelism. As a consequence, the size of the model that can be successfully converted into firmware with hls4ml largely depends on the amount of available resources on the target FPGA. Therefore it is highly recommended to compress the model with quantization (via QKeras or HGQ for Keras or Brevitas for PyTorch) and pruning. Additionally, ``hls4ml`` exploits the parallelism available in an FPGA or ASIC by implementing a spatial dataflow architecture. + +**Will my model work with hls4ml?** + +``hls4ml`` supports many common layers found in MLP, CNN and RNN architectures, however some seldom-used features of these layers may not be supported. Novel architectures such as graph networks or transformers are in various stages of development and are currently not stable for end-users. See the status and features page for more information. Models with custom layers can be supported through extension API. If you encounter a feature not yet supported, open a new issue. + +**Will my model with X parameters fit an FPGA model Y?** + +It depends. 
``hls4ml`` has been successfully used with quantized models with `O` (10k) parameters, while for some architectures going beyond `O` (1000) parameters is not doable even on the largest FPGAs. The number of parameters of a model is generally not a good estimate of the performance on an FPGA as the computational complexity of different types of NN layers has big effects on the resource consumption on an FPGA. For example, a CNN or GNN may reuse the same parameter in many operations. Furthermore, model compression in the form of quantization and pruning can significantly change the footprint of the model on the FPGA. For these reasons, we discourage the use of this metric for estimating performance. + +If you're looking for a quick estimate of the resource usage and latency for a given model without synthesis, look into `rule4ml `_ and `wa-hls4ml `_ projects. + +LLMs and large vision transformers are not supported nor planned. + +**How do I get started with hls4ml?** + +We strongly recommend interested users unfamiliar with FPGAs or model compression techniques to review the `hls4ml tutorials `_ to get an overview of the features and conversion workflow. + +**How do I contribute to hls4ml development?** + +We're always welcoming new contributions. If you have an interesting feature in mind feel free to start a new discussion thread with your proposal. We also have regular meetings online to discuss the status of developments where you can be invited to present your work. To receive announcements, `request to be added to our CERN e-group `_. Furthermore, check the `CONTRIBUTING `_ document for a set of technical requirements for making contributions to the hls4ml project. + + +Common HLS synthesis issues +*************************** + +**Stop unrolling loop ... because it may cause large runtime and excessive memory usage due to increase in code size.** + +This error is common with models that are too large to fit on the FPGA given the ``IOType`` used. If you are using ``io_parallel``, consider switching to ``io_stream``, which prevents unrolling all arrays. It may help to also use the ``Resource`` strategy. Pruning or quantizing the model may not help as it is related to the size of the loops. If possible, try to reduce the number of neurons/filters of your model to reduce the size of the activation tensors and thus number of iterations of loops. + +**cannot open shared object file ...: No such file or directory.** + +This is usually an indication that the compilation failed due to incorrect HLS code being produced. It is most likely a bug in hls4ml. Please open a bug report. Note that the displayed error message may be the same but the cause can be different. Unless you're sure that the existing bug reports show the same underlying issue, it is better to open a separate bug report. + +**My hls4ml predictions don't match the original Keras/PyTorch/ONNX ones** + +``hls4ml`` uses fixed-point precision types to represent internal data structures, unlike the floating-point precision types used for computation in upstream ML toolkits. If the used bit width is not sufficiently wide, you may encounter issues with computation accuracy that propagates through the layers. This is especially true for models that are not fully quantized, or models with insufficient ``accum_t`` bitwidth. Look into automatic precision inference and profiling tools to resolve the issue. 
+ +Note that bit-exact behavior is not always possible, as many math functions (used by activation functions) are approximated with lookup tables. diff --git a/docs/intro/introduction.rst b/docs/intro/introduction.rst new file mode 100644 index 0000000000..8d603bd78f --- /dev/null +++ b/docs/intro/introduction.rst @@ -0,0 +1,30 @@ +============ +Introduction +============ + +The goal of ``hls4ml`` is to provide an efficient and fast translation of machine learning models from open-source packages (like Keras and PyTorch) for training machine learning algorithms to high level synthesis (HLS) code that can then be transpiled to run on an FPGA. The resulting HLS project can be then used to produce an IP which can be plugged into more complex designs or be used to create a kernel for CPU co-processing. The user has freedom to define many of the parameters of their algorithm to best suit their needs. + +The ``hls4ml`` package enables fast prototyping of a machine learning algorithm implementation in FPGAs, +greatly reducing the time to results and giving the user intuition for how to best design a machine learning algorithm for their application while balancing performance, resource utilization and latency requirements. + +The Inspiration +=============== + +The inspiration for the creation of the ``hls4ml`` package stems from the high energy physics community at the CERN Large Hadron Collider (LHC). +While machine learning has already been proven to be extremely useful in analysis of data from detectors at the LHC, it is typically performed in an "offline" environment after the data is taken and agglomerated. +However, one of the largest problems at detectors on the LHC is that collisions, or "events", generate too much data for everything to be saved. +As such, filters called "triggers" are used to determine whether a given event should be kept. +Using FPGAs allows for significantly lower latency so machine learning algorithms can essentially be run "live" at the detector level for event selection. As a result, more events with potential signs of new physics can be preserved for analysis. + +The Solution: ``hls4ml`` +======================== + +.. image:: ../img/overview.jpg + + +With this in mind, let's take a look at how ``hls4ml`` helps to achieve such a goal. First, it's important to realize the architecture differences between an FPGA and a CPU or GPU. +An FPGA can be specifically programmed to do a certain task, in this case evaluate neural networks given a set of inputs, and as such can be highly optimized for the task, with tricks like pipelining and parallel evaluation. However, this means dynamic remapping while running isn't really a possibility. +FPGAs also often come at a comparatively low power cost with respect to CPUs and GPUs. This allows ``hls4ml`` to build HLS code from compressed neural networks that results in predictions on the microsecond scale for latency. +The ``hls4ml`` tool saves the time investment needed to convert a neural network to a hardware design language or even HLS code, thus allowing for rapid prototyping. + +For more detailed information on technical details of ``hls4ml``, read the "Internals" section of our documentation or our :doc:`References ` page. All figures on this page are taken from the following paper: `JINST 13 P07027 (2018) `_. 
diff --git a/docs/reference.rst b/docs/intro/reference.rst similarity index 99% rename from docs/reference.rst rename to docs/intro/reference.rst index f271679620..0bd5912bb1 100644 --- a/docs/reference.rst +++ b/docs/intro/reference.rst @@ -12,9 +12,9 @@ If you use this software in a publication, please cite the software @software{fastml_hls4ml, author = {{FastML Team}}, title = {fastmachinelearning/hls4ml}, - year = 2023, + year = 2024, publisher = {Zenodo}, - version = {v0.8.1}, + version = {v1.0.0}, doi = {10.5281/zenodo.1201549}, url = {https://github.com/fastmachinelearning/hls4ml} } diff --git a/docs/release_notes.rst b/docs/intro/release_notes.rst similarity index 100% rename from docs/release_notes.rst rename to docs/intro/release_notes.rst diff --git a/docs/setup.rst b/docs/intro/setup.rst similarity index 50% rename from docs/setup.rst rename to docs/intro/setup.rst index a735281c3f..6ba0c4ce0e 100644 --- a/docs/setup.rst +++ b/docs/intro/setup.rst @@ -14,7 +14,7 @@ The latest release of ``hls4ml`` can be installed with ``pip``: pip install hls4ml -If you want to use our :doc:`profiling ` toolbox, you might need to install extra dependencies: +If you want to use our :doc:`profiling <../advanced/profiling>` toolbox, you might need to install extra dependencies: .. code-block:: @@ -43,29 +43,36 @@ version can be installed directly from ``git``: Dependencies ============ -The ``hls4ml`` library depends on a number of Python packages and external tools for synthesis and simulation. Python dependencies are automatically managed +The ``hls4ml`` library requires Python 3.10 or later, and depends on a number of Python packages and external tools for synthesis and simulation. Python dependencies are automatically managed by ``pip`` or ``conda``. -* `TensorFlow `_ (version 2.4 and newer) and `QKeras `_ are required by the Keras converter. +* `TensorFlow `_ (version 2.8 to 2.14) and `QKeras `_ are required by the Keras converter. One may want to install newer versions of QKeras from GitHub. Newer versions of TensorFlow can be used, but QKeras and hls4ml do not currently support Keras v3. + * `ONNX `_ (version 1.4.0 and newer) is required by the ONNX converter. + * `PyTorch `_ package is optional. If not installed, the PyTorch converter will not be available. Running C simulation from Python requires a C++11-compatible compiler. On Linux, a GCC C++ compiler ``g++`` is required. Any version from a recent -Linux should work. On MacOS, the *clang*-based ``g++`` is enough. +Linux should work. On MacOS, the *clang*-based ``g++`` is enough. For the oneAPI backend, one must have oneAPI installed, along with the FPGA compiler, +to run C/SYCL simulations. To run FPGA synthesis, installation of the following tools is required: -* Xilinx Vivado HLS 2018.2 to 2020.1 for synthesis for Xilinx FPGAs +* Xilinx Vivado HLS 2018.2 to 2020.1 for synthesis for Xilinx FPGAs using the ``Vivado`` backend. + +* Vitis HLS 2022.2 or newer is required for synthesis for Xilinx FPGAs using the ``Vitis`` backend. - * Vitis HLS 2022.2 or newer is required for synthesis for Xilinx FPGAs using the ``Vitis`` backend. +* Intel Quartus 20.1 to 21.4 for synthesis for Intel/Altera FPGAs using the ``Quartus`` backend. -* Intel Quartus 20.1 to 21.4 for the synthesis for Intel FPGAs +* oneAPI 2024.1 to 2025.0 with the FPGA compiler and recent Intel/Altera Quartus for Intel/Altera FPGAs using the ``oneAPI`` backend. + +Catapult HLS 2024.1_1 or 2024.2 can be used to synthesize both for ASICs and FPGAs.
Quick Start ============= -For basic concepts to understand the tool, please visit the :doc:`Concepts ` chapter. +For basic concepts to understand the tool, please visit the :doc:`Concepts <../api/concepts>` chapter. Here we give line-by-line instructions to demonstrate the general workflow. .. code-block:: python @@ -98,78 +105,79 @@ After that, you can use :code:`Vivado HLS` to synthesize the model: Done! You've built your first project using ``hls4ml``! To learn more about our various API functionalities, check out our tutorials `here `__. -If you want to configure your model further, check out our :doc:`Configuration ` page. +If you want to configure your model further, check out our :doc:`Configuration <../api/configuration>` page. -Apart from our main API, we also support model conversion using a command line interface, check out our next section to find out more: +.. + Apart from our main API, we also support model conversion using a command line interface, check out our next section to find out more: -Getting started with hls4ml CLI (deprecated) --------------------------------------------- + Getting started with hls4ml CLI (deprecated) + -------------------------------------------- -As an alternative to the recommended Python PI, the command-line interface is provided via the ``hls4ml`` command. + As an alternative to the recommended Python PI, the command-line interface is provided via the ``hls4ml`` command. -To follow this tutorial, you must first download our ``example-models`` repository: + To follow this tutorial, you must first download our ``example-models`` repository: -.. code-block:: bash + .. code-block:: bash - git clone https://github.com/fastmachinelearning/example-models + git clone https://github.com/fastmachinelearning/example-models -Alternatively, you can clone the ``hls4ml`` repository with submodules + Alternatively, you can clone the ``hls4ml`` repository with submodules -.. code-block:: bash + .. code-block:: bash - git clone --recurse-submodules https://github.com/fastmachinelearning/hls4ml + git clone --recurse-submodules https://github.com/fastmachinelearning/hls4ml -The model files, along with other configuration parameters, are defined in the ``.yml`` files. -Further information about ``.yml`` files can be found in :doc:`Configuration ` page. + The model files, along with other configuration parameters, are defined in the ``.yml`` files. + Further information about ``.yml`` files can be found in :doc:`Configuration ` page. -In order to create an example HLS project, first go to ``example-models/`` from the main directory: + In order to create an example HLS project, first go to ``example-models/`` from the main directory: -.. code-block:: bash + .. code-block:: bash - cd example-models/ + cd example-models/ -And use this command to translate a Keras model: + And use this command to translate a Keras model: -.. code-block:: bash + .. code-block:: bash - hls4ml convert -c keras-config.yml + hls4ml convert -c keras-config.yml -This will create a new HLS project directory with an implementation of a model from the ``example-models/keras/`` directory. -To build the HLS project, do: + This will create a new HLS project directory with an implementation of a model from the ``example-models/keras/`` directory. + To build the HLS project, do: -.. code-block:: bash + .. code-block:: bash - hls4ml build -p my-hls-test -a + hls4ml build -p my-hls-test -a -This will create a Vivado HLS project with your model implementation! 
+ This will create a Vivado HLS project with your model implementation! -**NOTE:** For the last step, you can alternatively do the following to build the HLS project: + **NOTE:** For the last step, you can alternatively do the following to build the HLS project: -.. code-block:: Bash + .. code-block:: Bash - cd my-hls-test - vivado_hls -f build_prj.tcl + cd my-hls-test + vivado_hls -f build_prj.tcl -``vivado_hls`` can be controlled with: + ``vivado_hls`` can be controlled with: -.. code-block:: bash + .. code-block:: bash - vivado_hls -f build_prj.tcl "csim=1 synth=1 cosim=1 export=1 vsynth=1" + vivado_hls -f build_prj.tcl "csim=1 synth=1 cosim=1 export=1 vsynth=1" -Setting the additional parameters from ``1`` to ``0`` disables that step, but disabling ``synth`` also disables ``cosim`` and ``export``. + Setting the additional parameters from ``1`` to ``0`` disables that step, but disabling ``synth`` also disables ``cosim`` and ``export``. -Further help -^^^^^^^^^^^^ + Further help + ^^^^^^^^^^^^ -* For further information about how to use ``hls4ml``\ , do: ``hls4ml --help`` or ``hls4ml -h`` -* If you need help for a particular ``command``\ , ``hls4ml command -h`` will show help for the requested ``command`` -* We provide a detailed documentation for each of the command in the :doc:`Command Help <../command>` section + * For further information about how to use ``hls4ml``\ , do: ``hls4ml --help`` or ``hls4ml -h`` + * If you need help for a particular ``command``\ , ``hls4ml command -h`` will show help for the requested ``command`` + * We provide a detailed documentation for each of the command in the :doc:`Command Help ` section Existing examples ----------------- -* Examples of model files and weights can be found in `example_models `_ directory. * Training codes and examples of resources needed to train the models can be found in the `tutorial `__. +* Examples of model files and weights can be found in `example_models `_ directory. Uninstalling ------------ diff --git a/docs/status.rst b/docs/intro/status.rst similarity index 81% rename from docs/status.rst rename to docs/intro/status.rst index 4ff4d33282..5d3f3591f2 100644 --- a/docs/status.rst +++ b/docs/intro/status.rst @@ -18,8 +18,8 @@ A list of supported ML frameworks, HLS backends, and neural network architecture ML framework support: * (Q)Keras -* PyTorch (limited) -* (Q)ONNX (in development) +* PyTorch +* (Q)ONNX Neural network architectures: @@ -32,7 +32,9 @@ HLS backends: * Vivado HLS * Intel HLS -* Vitis HLS (experimental) +* Vitis HLS +* Catapult HLS +* oneAPI (experimental) A summary of the on-going status of the ``hls4ml`` tool is in the table below. @@ -46,35 +48,44 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. 
- Vivado HLS - Intel HLS - Vitis HLS + - Catapult HLS + - oneAPI * - MLP - ``supported`` - - ``limited`` - - ``in development`` + - ``supported`` + - ``supported`` + - ``supported`` + - ``supported`` - ``supported`` - ``supported`` - ``experimental`` * - CNN - ``supported`` - - ``limited`` - - ``in development`` + - ``supported`` + - ``supported`` + - ``supported`` + - ``supported`` - ``supported`` - ``supported`` - ``experimental`` * - RNN (LSTM) + - ``supported`` - ``supported`` - ``N/A`` - - ``in development`` - ``supported`` - ``supported`` - - ``N/A`` + - ``supported`` + - ``supported`` + - ``experimental`` * - GNN (GarNet) - ``supported`` + - ``in development`` + - ``N/A`` - ``N/A`` - ``N/A`` - ``N/A`` - ``N/A`` - ``N/A`` - Other feature notes: @@ -82,6 +93,9 @@ Other feature notes: * Vivado HLS versions 2018.2 to 2020.1 * Intel HLS versions 20.1 to 21.4 * Vitis HLS versions 2022.2 to 2024.1 + * Catapult HLS versions 2024.1_1 to 2024.2 + * oneAPI versions 2024.1 to 2025.0 + * Windows and macOS are not supported * BDT support has moved to the `Conifer `__ package diff --git a/docs/ir/attributes.rst b/docs/ir/attributes.rst new file mode 100644 index 0000000000..dfbec51b1c --- /dev/null +++ b/docs/ir/attributes.rst @@ -0,0 +1,2802 @@ +================ +Layer attributes +================ + + +Input +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Constant +======== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* value: ndarray + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Activation +========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +ParametrizedActivation +====================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +PReLU +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +* n_in: int + +* activation: str + +Weight attributes +----------------- +* param: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* param_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Softmax +======= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* implementation: list [latency,stable,argmax,legacy] (Default: stable) + + * Choice of implementation of softmax function. "latency" provides good latency at the expense of extra resources. performs well on small number of classes. "stable" may require extra clock cycles but has better accuracy. "legacy" is the older implementation which has bad accuracy, but is fast and has low resource use. It is superseded by the "latency" implementation for most applications. "argmax" is a special implementation that can be used if only the output with the highest probability is important. Using this implementation will save resources and clock cycles. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* skip: bool (Default: False) + + * If enabled, skips the softmax node and returns the raw outputs. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* exp_table_t: NamedType (Default: fixed<18,8,RND,SAT,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* inv_table_t: NamedType (Default: fixed<18,8,RND,SAT,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +TernaryTanh +=========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) 
+ +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +HardActivation +============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* slope_t: NamedType + +* shift_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* activation: str + +* slope: float (Default: 0.2) + +* shift: float (Default: 0.5) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* slope_t: NamedType + +* shift_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Reshape +======= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* target_shape: Sequence + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Dense +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. 
+ +* n_in: int + +* n_out: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Conv +==== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Conv1D +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Conv2D +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. 
This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Conv2DBatchnorm +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +SeparableConv1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* depth_multiplier: int (Default: 1) + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* depthwise: WeightVariable + +* pointwise: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* depthwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_result_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* dw_output_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * Available in: Catapult + +DepthwiseConv1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +* in_width: int + +* out_width: int + +* n_chan: int + +* depth_multiplier: int (Default: 1) + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +SeparableConv2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* depth_multiplier: int (Default: 1) + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* depthwise: WeightVariable + +* pointwise: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* depthwise_t: NamedType + +* pointwise_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* depthwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_accum_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_result_t: NamedType + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* depthwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* pointwise_reuse_factor: int (Default: 1) + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* dw_output_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * Available in: Catapult + +DepthwiseConv2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. 
+ +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* depth_multiplier: int (Default: 1) + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +BatchNormalization +================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* scale_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int (Default: -1) + +* use_gamma: bool (Default: True) + +* use_beta: bool (Default: True) + +Weight attributes +----------------- +* scale: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* scale_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Pooling1D +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_out: int + +* n_filt: int + +* pool_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +* count_pad: bool (Default: False) + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Pooling2D +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_filt: int + +* pool_height: int + +* pool_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +* count_pad: bool (Default: False) + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +GlobalPooling1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +GlobalPooling2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* n_filt: int + +* pool_op: list [Max,Average] + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +ZeroPadding1D +============= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* pad_left: int + +* pad_right: int + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +ZeroPadding2D +============= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Merge +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +MatMul +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. 
Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Dot +=== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, Vivado, VivadoAccelerator, VivadoAccelerator, Vitis, Vitis, Quartus, Quartus, Catapult, Catapult, SymbolicExpression, SymbolicExpression, oneAPI, oneAPI + +Concatenate +=========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +Resize +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* align_corners: bool (Default: False) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* algorithm: list [nearest,bilinear] (Default: nearest) + +Transpose +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Embedding +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* embeddings_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_out: int + +* vocab_size: int + +Weight attributes +----------------- +* embeddings: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* embeddings_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +SimpleRNN +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_out: int + +* activation: str + +* return_sequences: bool (Default: False) + +* return_state: bool (Default: False) + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* recurrent_weight: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
+ +* direction: list [forward,backward] (Default: forward) + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* recurrent_reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* static: bool (Default: True) + + * If set to True, will reuse the the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +LSTM +==== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_out: int + +* activation: str + +* recurrent_activation: str + +* return_sequences: bool (Default: False) + +* return_state: bool (Default: False) + +* time_major: bool (Default: False) + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* recurrent_weight: WeightVariable + +* recurrent_bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* direction: list [forward,backward] (Default: forward) + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* recurrent_reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* static: bool (Default: True) + + * If set to True, will reuse the the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +GRU +=== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_out: int + +* activation: str + +* recurrent_activation: str + +* return_sequences: bool (Default: False) + +* return_state: bool (Default: False) + +* time_major: bool (Default: False) + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +* recurrent_weight: WeightVariable + +* recurrent_bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* direction: list [forward,backward] (Default: forward) + +* apply_reset_gate: list [before,after] (Default: after) + +* weight_t: NamedType + +* bias_t: NamedType + +* recurrent_weight_t: NamedType + +* recurrent_bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* recurrent_reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* static: bool (Default: True) + + * If set to True, will reuse the the same recurrent block for computation, resulting in lower resource usage at the expense of serialized computation and higher latency/II. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +* table_size: int (Default: 1024) + + * The size of the lookup table used to approximate the function. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +* table_t: NamedType (Default: fixed<18,8,TRN,WRAP,0>) + + * The datatype (precision) used for the values of the lookup table. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, oneAPI + +GarNet +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, Vivado, VivadoAccelerator, VivadoAccelerator, Vitis, Vitis, Quartus, Quartus, Catapult, Catapult, SymbolicExpression, SymbolicExpression, oneAPI, oneAPI + +GarNetStack +=========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. 
Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, Vivado, VivadoAccelerator, VivadoAccelerator, Vitis, Vitis, Quartus, Quartus, Catapult, Catapult, SymbolicExpression, SymbolicExpression, oneAPI, oneAPI + +Quant +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* narrow: bool + +* rounding_mode: str + +* signed: bool + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +ApplyAlpha +========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* scale_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int (Default: -1) + +* use_gamma: bool (Default: True) + +* use_beta: bool (Default: True) + +Weight attributes +----------------- +* scale: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* scale_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +BatchNormOnnx +============= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. 
Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +LayerGroup +========== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* layer_list: list + +* input_layers: list + +* output_layers: list + +* data_reader: object + +* output_shape: list + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +SymbolicExpression +================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* expression: list + +* n_symbols: int + +* lut_functions: list (Default: []) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +BiasAdd +======= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Backend-specific attributes +--------------------------- +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +FixedPointQuantizer +=================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. 
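+
+As a rough illustration of how the configurable attributes listed on this page surface in the user-facing configuration (the model ``keras_model`` and the layer name ``fc1`` are placeholders):
+
+.. code-block:: python
+
+    import hls4ml
+
+    # Per-layer ('name' granularity) configuration exposes the configurable attributes of each layer
+    config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name')
+
+    config['LayerName']['fc1']['ReuseFactor'] = 4                          # reuse_factor
+    config['LayerName']['fc1']['Trace'] = True                             # trace
+    config['LayerName']['fc1']['Precision']['result'] = 'ap_fixed<16,6>'   # result_t
+
+    hls_model = hls4ml.converters.convert_from_keras_model(keras_model, hls_config=config)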
+ +UnaryLUT +======== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Repack +====== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Clone +===== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +BatchNormalizationQuantizedTanh +=============================== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* accum_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* n_in: int + +* n_filt: int (Default: 0) + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* accum_t: NamedType + +* reuse_factor: int (Default: 1) + +PointwiseConv1D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_width: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_width: int + +* stride_width: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. 
+ + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +PointwiseConv2D +=============== +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +* in_height: int + +* in_width: int + +* out_height: int + +* out_width: int + +* n_chan: int + +* n_filt: int + +* filt_height: int + +* filt_width: int + +* stride_height: int + +* stride_width: int + +* pad_top: int + +* pad_bottom: int + +* pad_left: int + +* pad_right: int + +Weight attributes +----------------- +* weight: WeightVariable + +* bias: WeightVariable + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +* weight_t: NamedType + +* bias_t: NamedType + +Backend-specific attributes +--------------------------- +* accum_t: NamedType + + * The datatype (precision) used to store intermediate results of the computation within the layer. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* reuse_factor: int (Default: 1) + + * The number of times each multiplier is used by controlling the amount of pipelining/unrolling. Lower number results in more parallelism and lower latency at the expense of the resources used.Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency. + + * Available in: Vivado, VivadoAccelerator, Vitis, Quartus, Catapult, SymbolicExpression, oneAPI + +* parallelization_factor: int (Default: 1) + + * The number of outputs computed in parallel. Essentially the number of multiplications of input window with the convolution kernel occuring in parallel. Higher number results in more parallelism (lower latency and II) at the expense of resources used.Currently only supported in io_parallel. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult, oneAPI + +* conv_implementation: list [LineBuffer,Encoded] (Default: LineBuffer) + + * "LineBuffer" implementation is preferred over "Encoded" for most use cases. 
This attribute only applies to io_stream. + + * Available in: Vivado, VivadoAccelerator, Vitis, Catapult + +Broadcast +========= +Base attributes +--------------- +* result_t: NamedType + + * The datatype (precision) of the output tensor. + +Type attributes +--------------- +* index: int + + * Internal node counter used for bookkeeping and variable/tensor naming. + +Configurable attributes +----------------------- +* trace: int (Default: False) + + * Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...) + +* result_t: NamedType + + * The datatype (precision) of the output tensor. diff --git a/docs/flows.rst b/docs/ir/flows.rst similarity index 84% rename from docs/flows.rst rename to docs/ir/flows.rst index 37b8b44ff9..dbdef58896 100644 --- a/docs/flows.rst +++ b/docs/ir/flows.rst @@ -2,17 +2,6 @@ Optimizer Passes and Flows ========================== -Internal Structure ------------------- - -The ``hls4ml`` library will parse models from Keras, PyTorch or ONNX into an internal execution graph. This model graph is represented with the -:py:class:`~hls4ml.model.graph.ModelGraph` class. The nodes in this graph, corresponding to the layer and operations of the input model are represented -by classes derived from the :py:class:`~hls4ml.model.layers.Layer` base class. - -Layers are required to have defined inputs and outputs that define how they are connected in the graph and what is the shape of their output. All information -about the layer's state and configuration is stored in its attributes. All weights, variables and data types are attributes and there are mapping views to sort through them. -Layers can define expected attributes and can be verified for correctness, or to produce a list of configurable attributes that user can tweak. - Optimizer passes ---------------- diff --git a/docs/ir/ir.rst b/docs/ir/ir.rst new file mode 100644 index 0000000000..18b0a1c679 --- /dev/null +++ b/docs/ir/ir.rst @@ -0,0 +1,90 @@ +======================= +Internal representation +======================= + +The ``hls4ml`` library will parse models from Keras, PyTorch or ONNX into an internal execution graph. This model graph is represented with the +:py:class:`~hls4ml.model.graph.ModelGraph` class. The nodes in this graph, loosely corresponding to the layers and operations of the input model are represented +by classes derived from the :py:class:`~hls4ml.model.layers.Layer` base class. + +Layers are required to have defined inputs and outputs that define how they are connected in the graph and what is the shape of their output. All information +about the layer's state and configuration is stored in its attributes. All weights, variables and data types are attributes and there are mapping views to sort through them. +Layers can define expected attributes and can be verified for correctness, or to produce a list of configurable attributes that user can tweak. The complete list of attributes can be found in the :doc:`Attributes ` page. + + +Layers +====== + +The backends of ``hls4ml`` are independent from each other and free to implement features in any suitable way, most implementations share common concepts which we will mention here. + +Dense Layers +------------ + +One-dimensional Dense Layers +**************************** + +Dense layers over one-dimensional data perform a matrix-vector multiplication followed by elementwise addition of bias tensor. This routine is the underlying computation of many other layers as well and is reused as much as possible. 
It exists in several implementations across different backends, for different ``io_type`` settings and strategies. + +io_parallel +^^^^^^^^^^^ + +All the backends have a ``Resource`` implementation, which divides the computation into a loop of ``reuse_factor`` iterations, each iteration simultaneously accessing a different part of the array partitioned in BRAM. There are different implementations depending on whether the reuse factor is smaller or larger than the input size. The two Xilinx backends and Catapult also provide a ``Latency`` implementation, which uses the reuse factor to control the amount of pipelining/unrolling of the whole function while the weight array is fully partitioned in registers. + +io_stream +^^^^^^^^^ + +The io_stream implementation only wraps the io_parallel implementation with streams or pipes for communication. Internally, data is still accessed in parallel as an array. + +Multi-dimensional Dense Layers +****************************** + +Multi-dimensional Dense layers are converted to pointwise convolutions, and do not directly use the implementation described above. + + +Convolution Layers +------------------ + +Standard convolution +******************** + +By *standard* convolution we refer to the operation represented by the ``Conv1D/2D`` layer in Keras (``Conv1d/2d`` in PyTorch). Depending on the ``io_type`` option used, there are two classes of implementations in ``hls4ml``. + +io_parallel +^^^^^^^^^^^ + +Parallel IO is applicable to small models that require a low-latency implementation. Larger models face synthesizability limits very quickly. + +In the Vivado/Vitis backends, parallel convolution relies on the *im2col* transformation of the input, which turns convolution into a matrix-multiplication task. This task is then implemented as a sequence of matrix-vector multiplications using the routine mentioned above. The ``Latency`` and ``Resource`` strategies refer to the function used for the matrix-vector multiplication routine, with ``Resource`` allowing slightly larger models to be synthesized. Parallelism can be further controlled via the ``ParallelizationFactor``. The Catapult backend, in turn, uses a direct implementation of convolution via nested loops. The ``Quartus``, ``oneAPI``, and ``Catapult`` backends also implement a ``Winograd`` algorithm, selectable by setting ``implementation`` to ``Winograd`` or ``combination``. The Winograd implementation is available only for a handful of filter-size configurations, and it provides weaker guarantees on bit accuracy and overflow. In certain conditions it can be faster. + +io_stream +^^^^^^^^^ + +There are two main classes of io_stream implementations, ``LineBuffer`` and ``Encoded``. ``LineBuffer`` is the default and generally produces marginally better results, +while ``Catapult`` and ``Vivado`` also implement ``Encoded``, selectable with the ``ConvImplementation`` configuration option. In all cases, the data is processed serially, one pixel at a time, with a pixel containing an array of all the channel values for that pixel.
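+
+As a rough sketch of how these convolution-related options are typically selected through the user-facing configuration (the model ``keras_model`` and the layer name ``conv1`` are placeholders, and the set of available options depends on the backend and ``io_type``):
+
+.. code-block:: python
+
+    import hls4ml
+
+    # Per-layer ('name' granularity) configuration; passing the backend exposes all configurable options
+    config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', backend='Vitis')
+
+    # Strategy selects the matrix-vector multiplication routine (Latency or Resource)
+    config['Model']['Strategy'] = 'Resource'
+
+    # io_parallel: control per-layer parallelism of the convolution
+    config['LayerName']['conv1']['ParallelizationFactor'] = 4
+
+    # io_stream: choose between the LineBuffer (default) and Encoded implementations
+    config['LayerName']['conv1']['ConvImplementation'] = 'LineBuffer'
+
+    hls_model = hls4ml.converters.convert_from_keras_model(keras_model, hls_config=config, backend='Vitis')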
+ +Depthwise convolution +********************* + +The depthwise implementation replaces the matrix-vector multiplication in the kernel with an elementwise multiplication. The only available implementation is based on the ``Latency`` strategy and is used by both ``io_parallel`` and ``io_stream``. + +Pointwise convolution +********************* + +Pointwise convolutions are a special case of convolution where the filter size is ``1`` for 1D or ``1x1`` for 2D. + +For the Vivado/Vitis backends, there is a dedicated ``io_parallel``/``Latency`` strategy implementation of 1D pointwise convolutional layers, originally developed for `arXiv:2402.01876 <https://arxiv.org/abs/2402.01876>`_. +The reuse factor (RF) is used to split the layer execution and reuse the existing module RF times. The RF also limits the number of multipliers in each module. +The initiation interval scales with the RF. One limitation is that it assumes ``in_width`` is divisible by the RF. + +Activations +----------- + +Most activations without extra parameters are represented with the ``Activation`` layer, and those with a single parameter (leaky ReLU, thresholded ReLU, ELU) with ``ParametrizedActivation``. ``PReLU`` has its own class because it has a parameter matrix (stored as a weight). The hard (piecewise linear) sigmoid and tanh functions are implemented in a ``HardActivation`` layer, and ``Softmax`` has its own layer class. + +Backends have four softmax implementations that the user can choose from by setting the ``implementation`` parameter: + +* **latency**: Good latency, but somewhat high resource usage. It does not work well if there are many output classes. +* **stable**: Slower but more accurate; useful in scenarios where high accuracy is needed. +* **legacy**: An older implementation with poor accuracy, but good performance. Usually the latency implementation is preferred. +* **argmax**: If you don't care about normalized outputs and only care about which one has the highest value, using argmax saves a lot of resources. This sets the highest value to 1, the others to 0. + +The Vivado/Vitis backends additionally support skipping the softmax activation entirely and returning the raw outputs. diff --git a/docs/api/hls-model.rst b/docs/ir/modelgraph.rst similarity index 58% rename from docs/api/hls-model.rst rename to docs/ir/modelgraph.rst index bf0d8ee3ce..048e67e101 100644 --- a/docs/api/hls-model.rst +++ b/docs/ir/modelgraph.rst @@ -1,8 +1,8 @@ ================ -HLS Model Class +ModelGraph Class ================ -This page documents our hls_model class usage. You can generate generate an hls model object from a keras model through ``hls4ml``'s API: +This page documents our ``ModelGraph`` class usage. You can generate an instance of this class through ``hls4ml``'s API, for example by converting a Keras model: .. code-block:: python @@ -11,10 +11,10 @@ This page documents our hls_model class usage. You can generate generate an hls # Generate a simple configuration from keras model config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name') - # Convert to an hls model + # Convert to a ModelGraph instance (hls_model) hls_model = hls4ml.converters.convert_from_keras_model(keras_model, hls_config=config, output_dir='test_prj') -After that, you can use several methods in that object. Here is a list of all the methods: +This object can be used to perform common simulation and firmware-generation tasks. Here is a list of important user-facing methods: * :ref:`write ` * :ref:`compile ` * :ref:`build ` * :ref:`trace ` -Similar functionalities are also supported through command line interface. If you prefer using them, please refer to Command Help section. - ---- .. _write-method: @@ -32,7 +30,7 @@ Similar functionalities are also supported through command line interface.
If yo ``write`` method ==================== -Write your keras model as a hls project to ``hls_model``\ 's ``output_dir``\ : +Write the ``ModelGraph`` to the output directory specified in the config: .. code-block:: python @@ -45,7 +43,7 @@ Write your keras model as a hls project to ``hls_model``\ 's ``output_dir``\ : ``compile`` method ====================== -Compile your hls project. +Compiles the written C++/HLS code and links it into the Python runtime. The compiled model can then be used to evaluate performance (accuracy) through the ``predict()`` method. .. code-block:: python @@ -58,7 +56,7 @@ Compile your hls project. ``predict`` method ====================== -Similar to ``keras``\ 's predict API, you can get the predictions of ``hls_model`` just by supplying an input ``numpy`` array: +Similar to ``keras``\ 's predict API, you can get the predictions just by supplying an input ``numpy`` array: .. code-block:: python @@ -67,7 +65,7 @@ Similar to ``keras``\ 's predict API, you can get the predictions of ``hls_model y = hls_model.predict(X) -This is similar to doing ``csim`` simulation, but you can get your prediction results much faster. It's very helpful when you want to quickly prototype different configurations for your model. +This is similar to doing ``csim`` simulation, but without having to create a testbench or supply the data yourself. It's very helpful when you want to quickly prototype different configurations for your model. ---- @@ -76,13 +74,17 @@ This is similar to doing ``csim`` simulation, but you can get your prediction re ``build`` method ==================== +This method "builds" the generated HLS project. The parameters of ``build()`` are backend-specific and usually include simulation and synthesis steps. Refer to each backend for a complete list of supported parameters. + .. code-block:: python - hls_model.build() + report = hls_model.build() #You can also read the report of the build hls4ml.report.read_vivado_report('hls4ml_prj') +The returned ``report`` object will contain the results of the build step, which may include C-simulation results, HLS synthesis estimates, co-simulation latency, etc., depending on the backend used. + ---- ..
_trace-method: diff --git a/docs/requirements.txt b/docs/requirements.txt index 66aa579ea6..fe3c4f2544 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,5 +4,4 @@ sphinx>=3.2.1 sphinx_contributors sphinx_github_changelog sphinx_rtd_theme -tensorflow<=2.15 toposort>=1.5.0 diff --git a/example-models b/example-models index 3cfbcfd062..c6bb3c0686 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit 3cfbcfd062f60492507d21ff0e91559b3bdd6550 +Subproject commit c6bb3c0686d52439d8c53d7407903bf78e852562 diff --git a/hls4ml/__init__.py b/hls4ml/__init__.py index 81b2859551..e3a7247b0d 100644 --- a/hls4ml/__init__.py +++ b/hls4ml/__init__.py @@ -1,4 +1,34 @@ -from hls4ml import converters, report, utils # noqa: F401 +# Temporary workaround for QKeras installation requirement, will be removed after 1.0.0 +def maybe_install_qkeras(): + import subprocess + import sys + + QKERAS_PKG_NAME = 'QKeras' + # QKERAS_PKG_SOURCE = QKERAS_PKG_NAME + QKERAS_PKG_SOURCE = 'qkeras@git+https://github.com/fastmachinelearning/qkeras.git' + + def pip_list(): + p = subprocess.run([sys.executable, '-m', 'pip', 'list'], check=True, capture_output=True) + return p.stdout.decode() + + def pip_install(package): + subprocess.check_call([sys.executable, '-m', 'pip', 'install', package]) + + all_pkgs = pip_list() + if QKERAS_PKG_NAME not in all_pkgs: + print('QKeras installation not found, installing one...') + pip_install(QKERAS_PKG_SOURCE) + print('QKeras installed.') + + +try: + maybe_install_qkeras() +except Exception: + print('Could not find QKeras installation, make sure you have QKeras installed.') + +# End of workaround + +from hls4ml import converters, report, utils # noqa: F401, E402 try: from ._version import version as __version__ diff --git a/hls4ml/backends/catapult/catapult_backend.py b/hls4ml/backends/catapult/catapult_backend.py index 5c85bf9b7e..030016d6cd 100644 --- a/hls4ml/backends/catapult/catapult_backend.py +++ b/hls4ml/backends/catapult/catapult_backend.py @@ -32,6 +32,7 @@ from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType from hls4ml.report import parse_catapult_report +from hls4ml.utils import attribute_descriptions as descriptions from hls4ml.utils.fixed_point_utils import ceil_log2 @@ -51,10 +52,12 @@ def _register_layer_attributes(self): for layer in rnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) - attrs.append(ConfigurableAttribute('static', value_type=bool, default=True)) - attrs.append(ConfigurableAttribute('table_size', default=1024)) - attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor)) + attrs.append( + ConfigurableAttribute('static', value_type=bool, default=True, description=descriptions.recurrent_static) + ) + attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) self.attribute_map[layer] = attrs # Add ParallelizationFactor to Conv1D/2D @@ -65,7 +68,7 @@ def _register_layer_attributes(self): for layer in pf_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + 
attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf)) self.attribute_map[layer] = attrs # Add ConvImplementation to Convolution+Pooling layers @@ -73,8 +76,14 @@ def _register_layer_attributes(self): for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) - # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + attrs.append( + ChoiceAttribute( + 'conv_implementation', + choices=['LineBuffer', 'Encoded'], + default='LineBuffer', + description=descriptions.conv_implementation, + ) + ) self.attribute_map[layer] = attrs sep_conv_layers = [SeparableConv1D, SeparableConv2D] @@ -88,6 +97,7 @@ def _register_flows(self): init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) streaming_passes = [ + 'catapult:inplace_stream_flatten', # Inform downstream changed packsize in case of skipping flatten 'catapult:reshape_stream', 'catapult:clone_output', 'catapult:insert_zero_padding_before_conv1d', diff --git a/hls4ml/backends/catapult/passes/merge_templates.py b/hls4ml/backends/catapult/passes/merge_templates.py index ff6928679c..b6548c5112 100755 --- a/hls4ml/backends/catapult/passes/merge_templates.py +++ b/hls4ml/backends/catapult/passes/merge_templates.py @@ -6,6 +6,7 @@ merge_config_template = """struct config{index} : nnet::merge_config {{ static const unsigned n_elem = {n_elem}; + static const unsigned reuse_factor = {reuse}; }};\n""" merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' diff --git a/hls4ml/backends/catapult/passes/pointwise.py b/hls4ml/backends/catapult/passes/pointwise.py index 0141d7f108..fd464ef172 100755 --- a/hls4ml/backends/catapult/passes/pointwise.py +++ b/hls4ml/backends/catapult/passes/pointwise.py @@ -1,5 +1,3 @@ -from copy import copy - from hls4ml.backends.catapult.passes.convolution_templates import ( Conv1DConfigTemplate, Conv1DFunctionTemplate, @@ -75,8 +73,10 @@ def match(self, node): def transform(self, model, node): dim = node.__class__.__name__[-2:] # '1D' or '2D' - pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy()) - pw_node.weights['bias'].data = node.weights['bias'].data + new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')} + pw_node = model.make_node( + 'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy() + ) # Set strategy to ensure lowercase string is passed to the template if model.config.is_resource_strategy(pw_node): pw_node.set_attr('strategy', 'resource') diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py index 7996adfd00..b20fdf1228 100644 --- a/hls4ml/backends/fpga/fpga_backend.py +++ b/hls4ml/backends/fpga/fpga_backend.py @@ -13,6 +13,8 @@ LSTM, Activation, BatchNormalization, + BatchNormOnnx, + Conv, Conv1D, Conv2D, Dense, @@ -22,8 +24,11 @@ GarNetStack, GlobalPooling1D, GlobalPooling2D, + MatMul, + Merge, Pooling1D, Pooling2D, + Quant, SeparableConv1D, SeparableConv2D, SimpleRNN, @@ -40,6 +45,7 @@ UnspecifiedPrecisionType, XnorPrecisionType, ) +from hls4ml.utils import attribute_descriptions as descriptions from hls4ml.writer import get_writer @@ -63,21 +69,32 @@ def __init__(self, name): LSTM, GRU, Dot, + Conv, + MatMul, ] for layer in 
accum_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(TypeAttribute('accum')) + attrs.append(TypeAttribute('accum', description=descriptions.accum_type)) self.attribute_map[layer] = attrs - rf_layers = accum_layers + [BatchNormalization, Activation, Embedding, GarNet, GarNetStack] + rf_layers = accum_layers + [ + BatchNormalization, + Activation, + Embedding, + GarNet, + GarNetStack, + Quant, + BatchNormOnnx, + Merge, + ] for layer in rf_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('reuse_factor', default=1)) + attrs.append(ConfigurableAttribute('reuse_factor', default=1, description=descriptions.reuse_factor)) self.attribute_map[layer] = attrs - # seperable is kind of special because it is effectively two layers that will be split + # separable is kind of special because it is effectively two layers that will be split for layer in (SeparableConv1D, SeparableConv2D): attrs = self.attribute_map.get(layer, []) attrs.append(TypeAttribute('depthwise_accum')) @@ -88,23 +105,34 @@ def __init__(self, name): self.attribute_map[layer] = attrs act_attrs = self.attribute_map.get(Activation, []) - act_attrs.append(ConfigurableAttribute('table_size', default=1024)) - act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + act_attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + act_attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) self.attribute_map[Activation] = act_attrs softmax_attrs = self.attribute_map.get(Softmax, []) - softmax_attrs.append(ChoiceAttribute('implementation', ['latency', 'stable', 'argmax', 'legacy'], default='stable')) - softmax_attrs.append(ConfigurableAttribute('skip', value_type=bool, default=False)) + softmax_attrs.append( + ChoiceAttribute( + 'implementation', + ['latency', 'stable', 'argmax', 'legacy'], + default='stable', + description=descriptions.softmax_implementation, + ) + ) + softmax_attrs.append( + ConfigurableAttribute('skip', value_type=bool, default=False, description=descriptions.softmax_skip) + ) softmax_attrs.append( TypeAttribute( 'exp_table', default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), + description=descriptions.table_type, ) ) softmax_attrs.append( TypeAttribute( 'inv_table', default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), + description=descriptions.table_type, ) ) self.attribute_map[Softmax] = softmax_attrs @@ -238,10 +266,12 @@ def get_closest_reuse_factor(self, valid_rf, chosen_rf): else: return before - def set_closest_reuse_factor(self, layer, n_in, n_out, attribute='reuse_factor'): + def set_closest_reuse_factor(self, layer, n_in, n_out, attribute='reuse_factor', include_max_rf=True): assert attribute is not None, 'Reuse factor attribute cannot be None' valid_rf = self.get_valid_reuse_factors(n_in, n_out) + if not include_max_rf: + valid_rf.pop() chosen_rf = layer.get_attr(attribute) if chosen_rf not in valid_rf: closest_rf = self.get_closest_reuse_factor(valid_rf, chosen_rf) diff --git a/hls4ml/backends/fpga/fpga_layers.py b/hls4ml/backends/fpga/fpga_layers.py index 356973517c..0026ebe213 100644 --- a/hls4ml/backends/fpga/fpga_layers.py +++ b/hls4ml/backends/fpga/fpga_layers.py @@ -73,12 +73,14 @@ def set_thresholds(self, scale, bias, ternary_threshold=0.5): class PointwiseConv1D(Conv1D): '''Optimized Conv1D implementation for 1x1 kernels.''' - # 
Nothing to do, will pick up function and config from class name - pass + def initialize(self): + # Do noting, values copied + pass class PointwiseConv2D(Conv2D): '''Optimized Conv2D implementation for 1x1 kernels.''' - # Nothing to do, will pick up function and config from class name - pass + def initialize(self): + # Do noting, values copied + pass diff --git a/hls4ml/backends/fpga/passes/clone.py b/hls4ml/backends/fpga/passes/clone.py index 306e839900..a36d96dfa8 100644 --- a/hls4ml/backends/fpga/passes/clone.py +++ b/hls4ml/backends/fpga/passes/clone.py @@ -1,4 +1,4 @@ -import numpy as np +from math import prod from hls4ml.backends.template import FunctionCallTemplate from hls4ml.model.layers import Layer, register_layer @@ -54,41 +54,60 @@ def match(self, node): if isinstance(node, Clone): return False - return True + # Not needed for io_parallel + io_type = node.model.config.get_config_value('IOType') + if io_type != 'io_stream': + return False + + # Check if the output is used more than once + output_map = node.get_output_use_map() + in_output = node.name in node.model.outputs + for output in node.outputs: + if len(output_map[output]) + in_output > 1: + # model output also need a stream + return True + + return False def transform(self, model, node): - if model.config.get_config_value('IOType') != 'io_stream': - return False output_map = node.get_output_use_map() + in_output = node.name in node.model.outputs transformed = False for output in node.outputs: - if len(output_map[output]) > 1: - if len(output_map[output]) > 3: - print( - 'WARNING: Cloning output {} of {} ({}) more than 3 times not currently supported'.format( - output, node.__class__.__name__, node.name - ) - ) - return False - out_var = node.get_output_variable(output) - for i, layer in enumerate(output_map[output], 1): - attrs = {'size': np.prod(out_var.shape)} - idx = layer.inputs.index(output) - layer.inputs[idx] = output + '_cpy' + str(i) - - clone_layer: Clone = model.make_node( - Clone, - 'clone_' + node.name, - attrs, - [output], - [output + '_cpy' + str(i + 1) for i in range(len(output_map[output]))], - ) - for i in range(len(output_map[output])): - key = output + '_cpy' + str(i + 1) - clone_layer.attributes[key].type = node.attributes['result_t'] - model.insert_node(clone_layer) - transformed = True + n_outputs = len(output_map[output]) + in_output + if n_outputs == 1: + continue + if n_outputs > 3: + msg = f'ERROR: Cloning output {output} of {node.class_name}\ + ({node.name}) more than 3 times not currently supported' + raise ValueError(msg) + + out_var = node.get_output_variable(output) + attrs = {'size': prod(out_var.shape)} + + init_stream_idx = 1 + if in_output: + # If the value is used as output, add one extra stream + idx = node.model.outputs.index(node.name) + node.model.outputs[idx] = node.name + '_cpy1' + init_stream_idx = 2 + for i, layer in enumerate(output_map[output], init_stream_idx): + idx = layer.inputs.index(output) + layer.inputs[idx] = output + f'_cpy{i}' + + clone_layer: Clone = model.make_node( + Clone, + 'clone_' + node.name, + attrs, + [output], + [output + '_cpy' + str(i + 1) for i in range(n_outputs)], + ) + for i in range(n_outputs): + key = output + '_cpy' + str(i + 1) + clone_layer.attributes[key].type = node.attributes['result_t'] + model.insert_node(clone_layer) + transformed = True return transformed diff --git a/hls4ml/backends/fpga/passes/codegen.py b/hls4ml/backends/fpga/passes/im2col_codegen.py similarity index 100% rename from hls4ml/backends/fpga/passes/codegen.py rename to 
hls4ml/backends/fpga/passes/im2col_codegen.py diff --git a/hls4ml/backends/fpga/passes/inplace_parallel_reshape.py b/hls4ml/backends/fpga/passes/inplace_parallel_reshape.py index 532becc9db..82efe67100 100644 --- a/hls4ml/backends/fpga/passes/inplace_parallel_reshape.py +++ b/hls4ml/backends/fpga/passes/inplace_parallel_reshape.py @@ -11,14 +11,21 @@ class InplaceParallelReshape(OptimizerPass): """ def match(self, node): - return isinstance(node, Reshape) - - def transform(self, model, node): - if model.config.get_config_value('IOType') != 'io_parallel': + if not isinstance(node, Reshape): return False + return node.model.config.get_config_value('IOType') == 'io_parallel' + def transform(self, model, node): outvar = node.get_output_variable() invar = node.get_input_variable() newoutvar = InplaceTensorVariable(outvar, invar) node.set_attr(node.outputs[0], newoutvar) + if node.name in model.outputs: + prev_node = node.get_input_node() + assert ( + prev_node.name not in model.outputs + ), f"Cannot output node {prev_node.name}: reshape is a no-op in io_parallel.\ + As a result, the previous node {prev_node.name}'s output will be used as the\ + output. However, this node is already an output." + model.outputs = [name if name != node.name else prev_node.name for name in model.outputs] return False diff --git a/hls4ml/backends/fpga/passes/inplace_stream_flatten.py b/hls4ml/backends/fpga/passes/inplace_stream_flatten.py index a16ffefc4a..be4994e96e 100644 --- a/hls4ml/backends/fpga/passes/inplace_stream_flatten.py +++ b/hls4ml/backends/fpga/passes/inplace_stream_flatten.py @@ -11,13 +11,20 @@ class InplaceStreamFlatten(OptimizerPass): """ def match(self, node): - # Reshape acts as a Flatten layer when the result has 1 dimension - return isinstance(node, Reshape) and len(node.get_output_variable().shape) == 1 + # Layers require flatten data can gather it from the stream, no need for repacking. + # Reshape acts as a Flatten layer when the result has 1 dimension. Make it a inplace tensor if it happens. - def transform(self, model, node): - if model.config.get_config_value('IOType') != 'io_stream': + if node.model.config.get_config_value('IOType') != 'io_stream': + return False + if not (isinstance(node, Reshape) and len(node.get_output_variable().shape) == 1): + # If is not flatten return False + if node.name in node.model.outputs: + # If used as model output. Output shape shall be preserved in this case. 
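To make the reworked ``InplaceStreamFlatten`` behaviour concrete: with ``io_stream``, a ``Flatten`` whose result feeds another layer is turned into an in-place tensor (no stream repacking), whereas a ``Flatten`` that is itself a model output is now left untouched so the output shape is preserved. Below is a hedged sketch of a model that exercises the first case, assuming the usual ``config_from_keras_model``/``convert_from_keras_model`` entry points; shapes and names are placeholders.

.. code-block:: python

    import tensorflow as tf
    import hls4ml

    # Toy model: the Flatten feeds a Dense layer, so it can become an in-place no-op
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(32, 4)),
        tf.keras.layers.Conv1D(8, 3, activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado')
    hls_model = hls4ml.converters.convert_from_keras_model(
        model, hls_config=config, io_type='io_stream', output_dir='flatten_demo'
    )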
+ return False + return True + def transform(self, model, node): outvar = node.get_output_variable() invar = node.get_input_variable() newoutvar = InplaceTensorVariable(outvar, invar) diff --git a/hls4ml/backends/fpga/passes/repack_stream.py b/hls4ml/backends/fpga/passes/repack_stream.py index 2408ec5ebe..9a77dddb29 100644 --- a/hls4ml/backends/fpga/passes/repack_stream.py +++ b/hls4ml/backends/fpga/passes/repack_stream.py @@ -49,7 +49,9 @@ class ReshapeStream(OptimizerPass): def match(self, node): # do not run optimizer pass for a flatten layer (1 output dimension) - return isinstance(node, Reshape) and len(node.get_output_variable().shape) > 1 + if not isinstance(node, Reshape): + return False + return len(node.get_output_variable().shape) > 1 or node.name in node.model.outputs def transform(self, model, node): if model.config.get_config_value('IOType') != 'io_stream': diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py index 801174832d..7d0f0d48e2 100644 --- a/hls4ml/backends/oneapi/oneapi_backend.py +++ b/hls4ml/backends/oneapi/oneapi_backend.py @@ -10,6 +10,7 @@ from hls4ml.model.layers import GRU, LSTM, Activation, Conv1D, Conv2D, Dense, Embedding, Layer, SimpleRNN, Softmax from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType +from hls4ml.utils import attribute_descriptions as descriptions # from hls4ml.report import parse_oneapi_report @@ -30,9 +31,20 @@ def _register_layer_attributes(self): for layer in rnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) - attrs.append(ConfigurableAttribute('table_size', default=1024)) - attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor)) + attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) + self.attribute_map[layer] = attrs + + # Add ParallelizationFactor to Conv1D/2D + pf_layers = [ + Conv1D, + Conv2D, + ] + + for layer in pf_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf)) self.attribute_map[layer] = attrs def _register_flows(self): diff --git a/hls4ml/backends/oneapi/oneapi_template.py b/hls4ml/backends/oneapi/oneapi_template.py index b11191939d..c86b8f7ea3 100644 --- a/hls4ml/backends/oneapi/oneapi_template.py +++ b/hls4ml/backends/oneapi/oneapi_template.py @@ -6,6 +6,15 @@ class StreamFunctionCallTemplate(Template): + """Base class for the streaming function call templates in oneAPI: provides the 'stream_function_cpp' attribute. + This generally provides the async call to the task sequence that executes the streaming function. + + Note: the include header files are specified in the regular FunctionCallTemplate, not here. + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. 
+ """ + def __init__(self, layer_class): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) @@ -24,6 +33,13 @@ def transform(self, model, node): class TaskSequenceTemplate(Template): + """Base class for the task sequence definition in oneAPI: provides the 'task_sequence_cpp' attribute. + This defines the task sequence that is then called by the StreamFunctionCallTemplate. + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + """ + def __init__(self, layer_class): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) diff --git a/hls4ml/backends/oneapi/passes/convolution_templates.py b/hls4ml/backends/oneapi/passes/convolution_templates.py index 17154559d8..64d9e42228 100644 --- a/hls4ml/backends/oneapi/passes/convolution_templates.py +++ b/hls4ml/backends/oneapi/passes/convolution_templates.py @@ -1,7 +1,7 @@ from hls4ml.backends.backend import get_backend from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate -from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm +from hls4ml.model.layers import Conv1D, Conv2D, Conv2DBatchnorm, DepthwiseConv1D, DepthwiseConv2D # TODO - Dilation rate ? @@ -70,9 +70,20 @@ conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h'] +depthconv1d_function_template = ( + 'nnet::depthwise_conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +depthconv1d_include_list = [ + 'nnet_utils/nnet_conv1d.h', + 'nnet_utils/nnet_conv1d_resource.h', + 'nnet_utils/nnet_depthconv1d.h', + 'nnet_utils/nnet_depthconv1d_resource.h', +] + + class Conv1DConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__(Conv1D) + super().__init__((Conv1D, DepthwiseConv1D)) self.template = conv1d_config_template self.mult_template = conv_mult_config_template @@ -137,6 +148,12 @@ def format(self, node): return self.template.format(**params) +class DepthwiseConv1DFunctionTemplate(Conv1DFunctionTemplate): + def __init__(self): + super(Conv1DFunctionTemplate, self).__init__(DepthwiseConv1D, include_header=depthconv1d_include_list) + self.template = depthconv1d_function_template + + ''' 2D Conv ''' conv2d_config_template = """struct config{index} : nnet::conv2d_config {{ static const unsigned in_height = {in_height}; @@ -183,7 +200,7 @@ def format(self, node): class Conv2DConfigTemplate(LayerConfigTemplate): def __init__(self): - super().__init__((Conv2D, Conv2DBatchnorm)) + super().__init__((Conv2D, Conv2DBatchnorm, DepthwiseConv2D)) self.template = conv2d_config_template self.mult_template = conv_mult_config_template @@ -233,3 +250,20 @@ def format(self, node): raise RuntimeError('channels_first not supported on oneAPI') params['data_format'] = 'cl' return self.template.format(**params) + + +depthconv2d_function_template = ( + 'nnet::depthwise_conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' +) +depthconv2d_include_list = [ + 'nnet_utils/nnet_conv2d.h', + 'nnet_utils/nnet_conv2d_resource.h', + 'nnet_utils/nnet_depthconv2d.h', + 'nnet_utils/nnet_depthconv2d_resource.h', +] + + +class DepthwiseConv2DFunctionTemplate(Conv2DFunctionTemplate): + def __init__(self): + super(Conv2DFunctionTemplate, self).__init__(DepthwiseConv2D, include_header=depthconv2d_include_list) + self.template = 
depthconv2d_function_template diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py index 916d9b196e..5ccf1a5213 100644 --- a/hls4ml/backends/oneapi/passes/core_templates.py +++ b/hls4ml/backends/oneapi/passes/core_templates.py @@ -347,5 +347,5 @@ def __init__(self): def format(self, node): params = self._default_function_params(node) - params['param'] = node.get_weights('alpha').name + params['param'] = node.get_weights('param').name return self.template.format(**params) diff --git a/hls4ml/backends/oneapi/passes/merge_templates.py b/hls4ml/backends/oneapi/passes/merge_templates.py index c38e1e055f..9d261e1f74 100644 --- a/hls4ml/backends/oneapi/passes/merge_templates.py +++ b/hls4ml/backends/oneapi/passes/merge_templates.py @@ -10,6 +10,7 @@ # Merge templates merge_config_template = """struct config{index} : nnet::merge_config {{ static const unsigned n_elem = {n_elem}; + static const unsigned reuse_factor = {reuse}; }};\n""" merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' diff --git a/hls4ml/backends/quartus/passes/merge_templates.py b/hls4ml/backends/quartus/passes/merge_templates.py index 0cf6121666..f71489a5cf 100644 --- a/hls4ml/backends/quartus/passes/merge_templates.py +++ b/hls4ml/backends/quartus/passes/merge_templates.py @@ -9,6 +9,7 @@ # Merge templates merge_config_template = """struct config{index} : nnet::merge_config {{ static const unsigned n_elem = {n_elem}; + static const unsigned reuse_factor = {reuse}; }};\n""" merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' diff --git a/hls4ml/backends/quartus/passes/pointwise.py b/hls4ml/backends/quartus/passes/pointwise.py index 0f7f6821ae..d65ab22569 100644 --- a/hls4ml/backends/quartus/passes/pointwise.py +++ b/hls4ml/backends/quartus/passes/pointwise.py @@ -1,5 +1,3 @@ -from copy import copy - from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D from hls4ml.backends.quartus.passes.convolution_templates import ( Conv1DConfigTemplate, @@ -81,10 +79,10 @@ def match(self, node): def transform(self, model, node): dim = node.__class__.__name__[-2:] # '1D' or '2D' + new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')} pw_node = model.make_node( - 'PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy(), outputs=node.outputs.copy() + 'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy() ) - pw_node.weights['bias'].data = node.weights['bias'].data model.replace_node(node, pw_node) return True diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py index aecad642c6..6e596fe2d1 100644 --- a/hls4ml/backends/quartus/quartus_backend.py +++ b/hls4ml/backends/quartus/quartus_backend.py @@ -11,6 +11,7 @@ from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType from hls4ml.report import parse_quartus_report +from hls4ml.utils import attribute_descriptions as descriptions @contextmanager @@ -39,16 +40,21 @@ def _register_layer_attributes(self): for layer in rnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) - attrs.append(ConfigurableAttribute('table_size', default=1024)) - 
attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor)) + attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) self.attribute_map[layer] = attrs def _register_flows(self): initializers = self._get_layer_initializers() init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) - streaming_passes = ['quartus:reshape_stream', 'quartus:clone_output'] + streaming_passes = [ + 'quartus:inplace_stream_flatten', # Inform downstream changed packsize in case of skipping flatten + 'quartus:reshape_stream', + 'quartus:clone_output', + ] + streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name) quartus_types = [ diff --git a/hls4ml/backends/template.py b/hls4ml/backends/template.py index 9638b53add..f7f6fe313a 100644 --- a/hls4ml/backends/template.py +++ b/hls4ml/backends/template.py @@ -2,6 +2,14 @@ class Template(OptimizerPass): + """The Template base class, should not be instantiated directly + + Args: + name (str): Name of the template. + layer_class (Layer or list, tuple, or aet of Layers): The Layers that this template handles. + attribute_name (str): The type of attribute provided + """ + def __init__(self, name, layer_class, attribute_name): self.name = name self.layer_class = layer_class @@ -36,6 +44,12 @@ def _default_params(self, node): class LayerConfigTemplate(Template): + """Base class for layer config templates: provides the 'config_cpp' attribute + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + """ + def __init__(self, layer_class): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) @@ -53,6 +67,13 @@ def _default_config_params(self, layer): class FunctionCallTemplate(Template): + """Base class for function call templates: provides the 'function_cpp' attribute + + Args: + layer_class (Layer or list, tuple, or set of Layers): The Layers that this template handles. + include_header (list, tuple, or set of str, or None): The list of needed include files + """ + def __init__(self, layer_class, include_header=None): if isinstance(layer_class, (list, tuple, set)): name = '_'.join([cls.__name__.lower() for cls in layer_class]) diff --git a/hls4ml/backends/vitis/passes/feature_check.py b/hls4ml/backends/vitis/passes/feature_check.py index d7f9c2a7f5..a38f6581f6 100644 --- a/hls4ml/backends/vitis/passes/feature_check.py +++ b/hls4ml/backends/vitis/passes/feature_check.py @@ -14,7 +14,7 @@ def transform(self, model, node): node.set_attr('implementation', 'linebuffer') -class ValidateStrategy(OptimizerPass): +class ValidateResourceStrategy(OptimizerPass): _resource_layer_cls = ['Conv1D', 'Conv2D', 'Dense'] def match(self, node): @@ -29,6 +29,23 @@ def transform(self, model, node): if rf > n_in and rf % n_in > 0: print( f'WARNING: "Resource" strategy in "{node.name}" ({node.class_name}) may have suboptimal QoR in Vitis ' - 'backend due to use of "urem" cores.\n' - 'Consider using a different ReuseFactor or switching to "Latency" strategy.' 
+ 'backend due to use of "urem" cores in Vitis HLS <= 2022.1.\n' + 'Consider using a different ReuseFactor or switching to "Latency" strategy if using older versions ' + 'of Vitis HLS.' ) + + +class ValidateResourceUnrolledStrategy(OptimizerPass): + _unrolled_layer_cls = ['Conv1D', 'Conv2D', 'Dense', 'GRU', 'LSTM'] + + def match(self, node): + is_unrolled_layer = len([layer_cls for layer_cls in self._unrolled_layer_cls if layer_cls in node.class_name]) > 0 + is_unrolled_strategy = node.get_attr('strategy', 'latency').lower() == 'resource_unrolled' + + return is_unrolled_layer and is_unrolled_strategy + + def transform(self, model, node): + print( + f'WARNING: "ResourceUnrolled" strategy in "{node.name}" ({node.class_name}) may have unexpected II in' + 'Vitis backend.\nVerify that the final design satisfies the latency/II constraints.' + ) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 89484237f3..0110f78313 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -15,7 +15,8 @@ def __init__(self): def _register_flows(self): validation_passes = [ 'vitis:validate_conv_implementation', - 'vitis:validate_strategy', + 'vitis:validate_resource_strategy', + 'vitis:validate_resource_unrolled_strategy', ] validation_flow = register_flow('validation', validation_passes, requires=['vivado:init_layers'], backend=self.name) diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py index 7f3832ba28..e098107eae 100644 --- a/hls4ml/backends/vivado/passes/convolution_templates.py +++ b/hls4ml/backends/vivado/passes/convolution_templates.py @@ -22,6 +22,8 @@ typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; + template + using kernel = nnet::{dense_function}; template using product = nnet::product::{product_type}; }};\n""" @@ -58,6 +60,8 @@ typedef {config_t} mult_config; template using scale_index = nnet::{scale_index_type}; + template + using conv_kernel = nnet::{conv_fn}; }}; const ap_uint config{index}::pixels[] = {{{instructions}}};\n""" @@ -91,15 +95,46 @@ def format(self, node): else: params['fill_fn'] = 'FillConv1DBuffer' + is_pointwise_parallel_latency = ( + node.get_attr('filt_width') == 1 + and node.get_attr('strategy').lower() == 'latency' + and node.model.config.get_config_value('IOType') == 'io_parallel' + ) + if is_pointwise_parallel_latency: + params['conv_fn'] = f'pointwise_conv_{node.index}' + else: + if node.get_attr('strategy').lower() == 'latency': + params['conv_fn'] = 'Conv1DLatency' + else: + params['conv_fn'] = 'Conv1DResource' + conv_config = self.template.format(**params) mult_params = self._default_config_params(node) - mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') - mult_params['n_out'] = node.get_attr('n_filt') + if is_pointwise_parallel_latency: + mult_params['n_in'] = int( + node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse'] + ) + mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse']) + else: + mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width') + mult_params['n_out'] = node.get_attr('n_filt') mult_params['nzeros'] = node.get_weights('weight').nzeros mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) + + if 
node.get_attr('strategy').lower() == 'latency': + mult_params['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(mult_params['reuse_factor']) <= int(mult_params['n_in']): + mult_params['dense_function'] = 'DenseResource_rf_leq_nin' + else: + mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'resource_unrolled': + mult_params['dense_function'] = f'dense_resource_unrolled_{node.index}' + mult_config = self.mult_template.format(**mult_params) return mult_config + '\n' + conv_config @@ -213,6 +248,18 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) + + if node.get_attr('strategy').lower() == 'latency': + mult_params['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(mult_params['reuse_factor']) <= int(mult_params['n_in']): + mult_params['dense_function'] = 'DenseResource_rf_leq_nin' + else: + mult_params['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'resource_unrolled': + mult_params['dense_function'] = f'dense_resource_unrolled_{node.index}' + mult_config = self.mult_template.format(**mult_params) return mult_config + '\n' + conv_config @@ -297,6 +344,8 @@ def format(self, node): params['scale_index_type'] = 'scale_index_regular' params['config_t'] = f'config{node.index}_depthwise_mult' + # TODO - Extend unrolled Dense Resource + params['unrolled_function'] = 'DenseResourceUnrolled' depthwise_config = self.depthwise_template.format(**params) # Depthwise mult config @@ -309,6 +358,9 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision ) + # TODO - Extend unrolled Dense Resource to depthwise Conv1D + mult_params['unrolled_function'] = 'DenseResourceUnrolled' + depthwise_mult_config = self.depthwise_mult_template.format(**mult_params) # Pointwise config @@ -338,6 +390,8 @@ def format(self, node): params['scale_index_type'] = 'scale_index_regular' params['config_t'] = f'config{node.index}_pointwise_mult' + # TODO - Extend unrolled Dense Resource + params['unrolled_function'] = 'DenseResourceUnrolled' pointwise_config = self.pointwise_template.format(**params) # Pointwise mult config @@ -350,6 +404,9 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('pointwise').type.precision ) + # TODO - Extend unrolled Dense Resource to separable Conv1D + mult_params['unrolled_function'] = 'DenseResourceUnrolled' + pointwise_mult_config = self.pointwise_mult_template.format(**mult_params) return ( @@ -425,6 +482,8 @@ def format(self, node): params['scale_index_width_type'] = 'scale_index_regular' params['config_t'] = f'config{node.index}_depthwise_mult' + # TODO - Extend unrolled Dense Resource + params['unrolled_function'] = 'DenseResourceUnrolled' depthwise_config = self.depthwise_template.format(**params) # Depthwise mult config @@ -437,6 +496,8 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('depthwise').type.precision ) + # TODO - Extend unrolled Dense Resource to depthwise Conv2D + 
mult_params['unrolled_function'] = 'DenseResourceUnrolled' depthwise_mult_config = self.depthwise_mult_template.format(**mult_params) # Pointwise config @@ -474,6 +535,8 @@ def format(self, node): else: params['scale_index_width_type'] = 'scale_index_regular' params['config_t'] = f'config{node.index}_pointwise_mult' + # TODO - Extend unrolled Dense Resource + params['unrolled_function'] = 'DenseResourceUnrolled' pointwise_config = self.pointwise_template.format(**params) # Pointwise mult config @@ -486,6 +549,8 @@ def format(self, node): mult_params['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('pointwise').type.precision ) + # TODO - Extend unrolled Dense Resource to separable Conv2D + mult_params['unrolled_function'] = 'DenseResourceUnrolled' pointwise_mult_config = self.pointwise_mult_template.format(**mult_params) return ( diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index b20a89f9ad..836da6e68a 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -19,6 +19,8 @@ typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; typedef {index_t.name} index_t; + template + using kernel = nnet::{dense_function}; template using product = nnet::product::{product_type}; }};\n""" @@ -41,6 +43,17 @@ def format(self, node): node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) + if node.get_attr('strategy').lower() == 'latency': + params['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(params['reuse_factor']) <= int(params['n_in']): + params['dense_function'] = 'DenseResource_rf_leq_nin' + else: + params['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'resource_unrolled': + params['dense_function'] = f'dense_resource_unrolled_{node.index}' + return self.template.format(**params) diff --git a/hls4ml/backends/vivado/passes/merge_templates.py b/hls4ml/backends/vivado/passes/merge_templates.py index 078e004d33..35aa5d3640 100644 --- a/hls4ml/backends/vivado/passes/merge_templates.py +++ b/hls4ml/backends/vivado/passes/merge_templates.py @@ -6,6 +6,7 @@ merge_config_template = """struct config{index} : nnet::merge_config {{ static const unsigned n_elem = {n_elem}; + static const unsigned reuse_factor = {reuse}; }};\n""" merge_function_template = 'nnet::{merge}<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' diff --git a/hls4ml/backends/vivado/passes/pipeline_style.py b/hls4ml/backends/vivado/passes/pipeline_style.py new file mode 100644 index 0000000000..66c2bbe71e --- /dev/null +++ b/hls4ml/backends/vivado/passes/pipeline_style.py @@ -0,0 +1,131 @@ +from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.optimizer import ModelOptimizerPass + + +class SetPipelineStyle(ModelOptimizerPass): + def __init__(self): + pass + + def transform(self, model): + if model.config.pipeline_style not in ['auto', 'pipeline', 'dataflow']: + print( + f'WARNING: Pipeline style set to {model.config.pipeline_style}, valid values: auto, pipeline, dataflow. ' + 'Using "auto".' 
+ ) + self._set_pipeline_style(model, 'auto') + + if model.config.pipeline_style is None or model.config.pipeline_style == 'auto': + + if self._maybe_set_dataflow_io_stream(model): + return True + + if self._maybe_set_dataflow_conv_layers(model): + return True + + if self._maybe_set_dataflow_resource_strategy(model): + return True + + if self._maybe_set_pipeline_resource_unrolled_strategy(model): + return True + + if self._maybe_set_pipeline_io_parallel(model): + return True + + self._set_safe_default_dataflow(model) + return True + else: + self._validate_hls_config(model) + + return False # No model changes made + + def _set_pipeline_style(self, model, pipeline_style): + # Could add logging here + model.config.pipeline_style = pipeline_style + + def _maybe_set_dataflow_io_stream(self, model): + if model.config.get_config_value('IOType') == 'io_stream': + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_dataflow_conv_layers(self, model): + for layer in model.get_layers(): + if isinstance(layer, (Conv1D, Conv2D)): + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_dataflow_resource_strategy(self, model): + for layer in model.get_layers(): + if model.config.is_resource_strategy(layer): + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_pipeline_resource_unrolled_strategy(self, model): + have_unrolled = False + for layer in model.get_layers(): + if model.config.get_strategy(layer).lower() == 'resource_unrolled': + self._set_pipeline_style(model, 'pipeline') + have_unrolled = True + break + + if have_unrolled: + model.config.pipeline_ii = max([int(layer.get_attr('reuse_factor')) for layer in model.get_layers()]) + + return have_unrolled + + def _maybe_set_pipeline_io_parallel(self, model): + if model.config.get_config_value('IOType') == 'io_parallel': + self._set_pipeline_style(model, 'pipeline') + return True + + return False + + def _set_safe_default_dataflow(self, model): + print( + 'WARNING: Couldn\'t determine best pipeline style, defaulting to "DATAFLOW". ' + 'Use "PipelineStyle" property to override.' + ) + self._set_pipeline_style(model, 'dataflow') + + def _validate_hls_config(self, model): + if model.config.pipeline_style.lower() == 'pipeline': + if model.config.model_compression: + print('WARNING: Compression enabled while pipeline style set to "pipeline".') + if model.config.model_strategy.lower() == 'resource': + print( + 'WARNING: Model strategy "Resource" will lead to bad QoR in combination ' + 'with pipeline style set to "pipeline".' + ) + if any(isinstance(layer, (Conv1D, Conv2D)) for layer in model.get_layers()): + print('WARNING: Convolution layers require "dataflow" pipeline style.') + for layer_type, strategy in model.config.layer_type_strategy.items(): + if strategy.lower() == 'resource' and model.config.pipeline_style.lower() == 'pipeline': + print( + f'WARNING: Strategy for layer type {layer_type} set to "Resource", while pipeline style set to ' + '"pipeline". This will lead to bad QoR.' 
+ ) + + for layer_name, strategy in model.config.layer_name_strategy.items(): + if strategy.lower() == 'resource' and model.config.pipeline_style.lower() == 'pipeline': + print( + 'WARNING: Strategy for layer {} set to "Resource", while pipeline style set to "pipeline".'.format( + layer_name + ) + ) + + for layer_type, compression in model.config.layer_type_compression.items(): + if compression and model.config.pipeline_style.lower() == 'pipeline': + print( + 'WARNING: Compression enabled for layer type {}, while pipeline style set to "pipeline".'.format( + layer_type + ) + ) + + for layer_name, compression in model.config.layer_name_compression.items(): + if compression and model.config.pipeline_style.lower() == 'pipeline': + print(f'WARNING: Compression enabled for layer {layer_name}, while pipeline style set to "pipeline".') diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py index 85d2635cb8..34568b09f7 100644 --- a/hls4ml/backends/vivado/passes/pointwise.py +++ b/hls4ml/backends/vivado/passes/pointwise.py @@ -1,5 +1,3 @@ -from copy import copy - from hls4ml.backends.fpga.fpga_layers import PointwiseConv1D, PointwiseConv2D from hls4ml.backends.vivado.passes.convolution_templates import ( Conv1DConfigTemplate, @@ -75,8 +73,11 @@ def match(self, node): def transform(self, model, node): dim = node.__class__.__name__[-2:] # '1D' or '2D' - pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy()) - pw_node.weights['bias'].data = node.weights['bias'].data + # to remove warning, since these get set again + new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')} + pw_node = model.make_node( + 'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy() + ) # Set strategy to ensure lowercase string is passed to the template if model.config.is_resource_strategy(pw_node): pw_node.set_attr('strategy', 'resource') diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py new file mode 100644 index 0000000000..d41d51f82f --- /dev/null +++ b/hls4ml/backends/vivado/passes/pointwise_codegen.py @@ -0,0 +1,84 @@ +from hls4ml.model.layers import Conv1D +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import Source + + +def generate_pointwise_conv1d_fn(layer_idx, reuse_factor=1): + """Generate a C++ function for a pointwise convolution layer. + + Args: + layer_idx (int): Index of layer ('index' attribute). + reuse_factor (int): Number of partitions to divide the input into. 
+ + Returns: + str: Generated C++ function + """ + + generated_code = ( + 'template\n' + 'class pointwise_conv_{index} : public Conv1DKernel {{\n' + ' public:\n' + ' static void conv(\n' + ' data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n' + ' res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n' + ' typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n' + ' typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n' + ' data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n' # noqa: E501 + ' #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n' + ' res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n' # noqa: E501 + ' #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n' + ' RFInputLoop:\n' + ' for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n' + ' #pragma HLS UNROLL\n' + ' InnerInputLoop:\n' + ' for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n' + ' #pragma HLS UNROLL\n' + ' data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n' # noqa: E501 + ' }}\n' + ' }}\n\n' + ).format(index=layer_idx) + indent = ' ' + for i in range(reuse_factor): + generated_code += indent + generated_code += ( + f'pointwise_conv_1d_latency_cl(data_tmp[{i}], res_tmp[{i}], weights, biases);\n' + ) + + generated_code += ( + '\n' + ' RFOutputLoop:\n' + ' for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n' + ' #pragma HLS UNROLL\n' + ' InnerOutputLoop:\n' + ' for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n' + ' #pragma HLS UNROLL\n' + ' res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n' # noqa: E501 + ' }\n' + ' }\n' + ' }\n' + '};\n' + ) + + return generated_code + + +class GeneratePointwiseConv1D(OptimizerPass): + '''Generates code for pointwise 1D convolution''' + + def match(self, node): + return ( + isinstance(node, Conv1D) + and node.model.config.get_config_value('IOType') == 'io_parallel' + and node.get_attr('filt_width') == 1 + ) + + def transform(self, model, node): + self._generate_pointwise_conv1d(node) + + def _generate_pointwise_conv1d(self, node): + code_str = generate_pointwise_conv1d_fn( + node.get_attr('index'), + node.get_attr('reuse_factor'), + ) + + node.set_attr('pointwise_conv1d_codegen', Source(code_str)) diff --git a/hls4ml/backends/vivado/passes/recurrent_templates.py b/hls4ml/backends/vivado/passes/recurrent_templates.py index adf95defdc..939713af22 100644 --- a/hls4ml/backends/vivado/passes/recurrent_templates.py +++ b/hls4ml/backends/vivado/passes/recurrent_templates.py @@ -16,7 +16,8 @@ typedef {accum_t.name} accum_t; typedef {bias_t.name} bias_t; typedef {weight_t.name} weight_t; - typedef {index_t.name} index_t; + template + using kernel = nnet::{dense_function}; template using product = nnet::product::{product_type}; }};\n""" @@ -115,11 +116,11 @@ def format(self, node): act_params['type'] = node.get_attr('activation') recr_act_params['type'] = node.get_attr('recurrent_activation') if node.get_attr('return_sequences'): - act_params['n_in'] = node.get_output_variable().dim_names[1] - recr_act_params['n_in'] = node.get_output_variable().dim_names[1] + ' * %i' % (n_recr_mult - 1) + act_params['n_in'] = node.get_output_variable().shape[1] + recr_act_params['n_in'] = node.get_output_variable().shape[1] * (n_recr_mult - 1) else: - 
act_params['n_in'] = node.get_output_variable().dim_names[0] - recr_act_params['n_in'] = node.get_output_variable().dim_names[0] + ' * %i' % (n_recr_mult - 1) + act_params['n_in'] = node.get_output_variable().shape[0] + recr_act_params['n_in'] = node.get_output_variable().shape[0] * (n_recr_mult - 1) act_config = self.act_template.format(**act_params) recr_act_config = self.recr_act_template.format(**recr_act_params) @@ -127,11 +128,11 @@ def format(self, node): mult_params1 = self._default_config_params(node) mult_params2 = self._default_config_params(node) - mult_params1['n_in'] = node.get_input_variable().dim_names[1] + mult_params1['n_in'] = node.get_input_variable().shape[1] if node.get_attr('return_sequences'): - mult_params1['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult + mult_params1['n_out'] = node.get_output_variable().shape[1] * n_recr_mult else: - mult_params1['n_out'] = node.get_output_variable().dim_names[0] + ' * %i' % n_recr_mult + mult_params1['n_out'] = node.get_output_variable().shape[0] * n_recr_mult mult_params1['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('weight').type.precision ) @@ -139,12 +140,24 @@ def format(self, node): mult_params1['index'] = str(node.index) + '_1' mult_params1['nzeros'] = node.get_weights('weight').nzeros mult_params1['nonzeros'] = node.get_weights('weight').nonzeros + + if node.get_attr('strategy').lower() == 'latency': + mult_params1['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(mult_params1['reuse_factor']) <= int(mult_params1['n_in']): + mult_params1['dense_function'] = 'DenseResource_rf_leq_nin' + else: + mult_params1['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'resource_unrolled': + mult_params1['dense_function'] = f'dense_resource_unrolled_{node.index}_1' + if node.get_attr('return_sequences'): - mult_params2['n_in'] = node.get_output_variable().dim_names[1] - mult_params2['n_out'] = node.get_output_variable().dim_names[1] + ' * %i' % n_recr_mult + mult_params2['n_in'] = node.get_output_variable().shape[1] + mult_params2['n_out'] = node.get_output_variable().shape[1] * n_recr_mult else: - mult_params2['n_in'] = node.get_output_variable().dim_names[0] - mult_params2['n_out'] = node.get_output_variable().dim_names[0] + ' * %i' % n_recr_mult + mult_params2['n_in'] = node.get_output_variable().shape[0] + mult_params2['n_out'] = node.get_output_variable().shape[0] * n_recr_mult mult_params2['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('recurrent_weight').type.precision ) @@ -153,6 +166,17 @@ def format(self, node): mult_params2['nzeros'] = node.get_weights('recurrent_weight').nzeros mult_params2['nonzeros'] = node.get_weights('recurrent_weight').nonzeros + if node.get_attr('strategy').lower() == 'latency': + mult_params2['dense_function'] = 'DenseLatency' + elif node.get_attr('strategy').lower() == 'resource': + if int(mult_params2['reuse_factor']) <= int(mult_params2['n_in']): + mult_params2['dense_function'] = 'DenseResource_rf_leq_nin' + else: + mult_params2['dense_function'] = 'DenseResource_rf_gt_nin_rem0' + # The 3rd case is never used + elif node.get_attr('strategy').lower() == 'resource_unrolled': + mult_params2['dense_function'] = f'dense_resource_unrolled_{node.index}_2' + mult_config1 = self.mult1_template.format(**mult_params1) 
mult_config2 = self.mult2_template.format(**mult_params2) diff --git a/hls4ml/backends/vivado/passes/resource_strategy.py b/hls4ml/backends/vivado/passes/resource_strategy.py index 63e6e0b4db..0c06190f30 100644 --- a/hls4ml/backends/vivado/passes/resource_strategy.py +++ b/hls4ml/backends/vivado/passes/resource_strategy.py @@ -9,7 +9,7 @@ class ApplyResourceStrategy(OptimizerPass): def match(self, node): node_matches = isinstance(node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU)) - is_resource_strategy = node.get_attr('strategy', '').lower() == 'resource' + is_resource_strategy = node.get_attr('strategy', '').lower() in ['resource', 'resource_unrolled'] already_transformed = node.get_attr('_weights_transposed', False) is True return node_matches and is_resource_strategy and not already_transformed diff --git a/hls4ml/backends/vivado/passes/unrolled_codegen.py b/hls4ml/backends/vivado/passes/unrolled_codegen.py new file mode 100644 index 0000000000..d901c77008 --- /dev/null +++ b/hls4ml/backends/vivado/passes/unrolled_codegen.py @@ -0,0 +1,243 @@ +import math + +import numpy as np + +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import Source + + +class GenerateUnrolledDenseResource(OptimizerPass): + '''Generates C++ code for unrolled Dense resource''' + + def match(self, node): + # Only apply to layers use that use Dense Matrix Multiplication + # TODO - Extend (& test) for Separable Conv / Depthwise Conv / Recurrent layers + layers_with_dense = (Dense, Conv1D, Conv2D, LSTM, GRU) + + # Unrolled Dense mimics the hardware implementation of Resource strategy -> apply after Resource optimizer + weights_transposed = node.get_attr('_weights_transposed', False) + + # RF = 1 will optimize DSPs anyway, so no need to unroll code + rf_gt_one = node.get_attr('reuse_factor', 1) > 1 + + # User requested unrolled implementation of Dense + is_unrolled = node.get_attr('strategy', 'latency') == 'resource_unrolled' + + return isinstance(node, layers_with_dense) and weights_transposed and rf_gt_one and is_unrolled + + def transform(self, model, node): + if isinstance(node, (LSTM, GRU)): + n_in, n_out, n_in_recr, n_out_recr = node.model.config.backend.get_layer_mult_size(node) + + reuse_factor = node.get_attr('reuse_factor') + weights = node.weights['weight'] + code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, str(node.index) + '_1') + code_str = self._add_backend_specific_pragmas_to_generated_code(code_str, model.config.backend) + node.set_attr('resource_unrolled_dense_codegen_1', Source(code_str)) + + recr_reuse_factor = node.get_attr('recurrent_reuse_factor') + recr_weights = node.weights['recurrent_weight'] + code_str = self._generate_unrolled_function( + n_in_recr, n_out_recr, recr_reuse_factor, recr_weights, str(node.index) + '_2' + ) + code_str = self._add_backend_specific_pragmas_to_generated_code(code_str, model.config.backend) + node.set_attr('resource_unrolled_dense_codegen_2', Source(code_str)) + + else: + n_in, n_out = node.model.config.backend.get_layer_mult_size(node) + reuse_factor = node.get_attr('reuse_factor') + weights = node.weights['weight'] + + code_str = self._generate_unrolled_function(n_in, n_out, reuse_factor, weights, node.index) + code_str = self._add_backend_specific_pragmas_to_generated_code(code_str, model.config.backend) + node.set_attr('resource_unrolled_dense_codegen', Source(code_str)) + + def _generate_unrolled_function(self, 
n_in, n_out, reuse_factor, weights, function_suffix): + """ + Generate a C++ function that mimics the Dense Resource implementation. + + The HLS compiler produces suboptimal designs for Dense Resource when the weights processed by the same DSP are zero. + Latency strategy can optimize zero multiplications + Resource strategy, on the other hand, cannot. + When all the weights in the same BRAM block are zero, Vivado is unable to optimize it + With this (and additional TCL scripts) zero BRAM are optimized + + Args: + node: Layer to generate code for + Returns: + generated_code: Generated C++ function (string) + """ + + # Variable instantiation and function pragmas + generated_code = ( + 'template\n' + 'class dense_resource_unrolled_{suffix} : public DenseKernel {{{{\n' + ' public:\n' + ' static void dense(\n' + ' data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],\n' + ' typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],\n' + ' typename CONFIG_T::bias_t biases[CONFIG_T::n_out]\n' + ' ) {{{{\n' + ' #pragma HLS pipeline II=CONFIG_T::reuse_factor\n' + '\n' + ' constexpr int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);\n' + ' #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor\n' + ' {{weights_resource_pragma}}\n' + ' #pragma HLS ARRAY_PARTITION variable=biases complete\n' + '\n' + ' typename CONFIG_T::accum_t acc[CONFIG_T::n_out];\n' + ' #pragma HLS ARRAY_PARTITION variable=acc complete\n' + '\n' + ' InitAccum:\n' + ' for (int i = 0; i < CONFIG_T::n_out; i++) {{{{\n' + ' #pragma HLS UNROLL\n' + ' acc[i] = (typename CONFIG_T::accum_t) biases[i];\n' + ' }}}}\n' + '\n' + ).format(suffix=function_suffix) + + # Unrolled multiplication, according to the three cases + if reuse_factor <= n_in: + mult_code = self._generate_unrolled_mult_code_rf_leq_nin(n_in, n_out, reuse_factor, weights) + elif reuse_factor > n_in and reuse_factor % n_in == 0: + mult_code = self._generate_unrolled_mult_code_rf_gt_nin_rem0(n_in, n_out, reuse_factor, weights) + else: + # This case shouldn't happen if my understanding of RF is correct + # The function fpga_backend._validate_reuse_factor() has assertion rf % n_in == 0 or rf < n_in + raise Exception('Not implemented...') + + # Write output + generated_code += mult_code + '\n' + generated_code += ( + ' Result:\n' + ' for (int i = 0; i < CONFIG_T::n_out; i++) {{\n' + ' #pragma HLS UNROLL\n' + ' res[i] = cast(acc[i]);\n' + ' }}\n' + ' }}\n' + '}};\n' + ) + + return generated_code + + def _generate_unrolled_mult_code_rf_leq_nin(self, n_in, n_out, reuse_factor, weights): + # Function constants + mult_factor = min(n_in, reuse_factor) + block_factor = int(math.ceil(n_in * n_out / reuse_factor)) + mult_limit = int(math.ceil(n_in * n_out / mult_factor)) + mult_scale = mult_limit // n_out + + # Zero DSPs are the DSP blocks that always have zero input + # In this case, it is the number of rows in the transposed and reshaped weight matrix + # The new shape is (parallel_mult, reuse_factor) + zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) + + # Used to pad the code to make it human-readable + indent = ' ' + + # Generate unrolled multiplications + mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n' + mult_code += f'{indent*2}MULT: {{{{\n' + + for ir in range(reuse_factor): + acc_step = 0 + out_index = 0 + w_index = ir + in_index = ir + + mult_code += f'{indent*3}M{ir}: {{{{\n' + for _ in range(block_factor): + if 
weights.data.flatten()[w_index] != 0: + mult_code += ( + f'{indent*4}acc[{out_index}] += ' + 'static_cast' + '(CONFIG_T::template product::' + f'product(data[{in_index}], weights[{w_index}]));\n' + ) + + w_index += reuse_factor + in_index += reuse_factor + if in_index >= n_in: + in_index = ir + if acc_step + 1 >= mult_scale: + acc_step = 0 + out_index += 1 + else: + acc_step += 1 + + mult_code += f'{indent*3}}}}}\n' + + mult_code += f'{indent*2}}}}}\n' + + return mult_code + + def _generate_unrolled_mult_code_rf_gt_nin_rem0(self, n_in, n_out, reuse_factor, weights): + # Function constants + mult_factor = min(n_in, reuse_factor) + block_factor = int(math.ceil(n_in * n_out / reuse_factor)) + mult_limit = int(math.ceil(n_in * n_out / mult_factor)) + + # Zero DSPs are the DSP blocks that always have zero input + # In this case, it is the number of rows in the transposed and reshaped weight matrix + # The new shape is (parallel_mult, reuse_factor) + zeros = np.sum(~weights.data.reshape(block_factor, reuse_factor).any(1)) + + # Used to pad the code to make it human-readable + indent = ' ' + + # Generate out indices + outidx = [0] * reuse_factor + outstep = 0 + outscale = reuse_factor // n_in + for ir in range(reuse_factor): + outidx[ir] = outstep + if (ir + 1) % n_in == 0: + outstep += 1 + + # Define variables + in_index = 0 + + # Generate unrolled multiplications + mult_code = f'{indent*2}#pragma HLS ALLOCATION operation instances=mul limit={mult_limit - zeros}\n' + mult_code += f'{indent*2}MULT: {{{{\n' + + for ir in range(reuse_factor): + w_index = ir + out_index = outidx[ir] + + mult_code += f'{indent*3}M{ir}: {{{{\n' + for _ in range(block_factor): + if weights.data.flatten()[w_index] != 0: + mult_code += ( + f'{indent*4}acc[{int(out_index)}] += ' + 'static_cast' + '(CONFIG_T::template product::' + f'product(data[{in_index}], weights[{w_index}]));\n' + ) + + w_index += reuse_factor + if w_index > n_in * n_out: + break + out_index += outscale + mult_code += f'{indent*3}}}}}\n' + + in_index += 1 + if in_index >= n_in: + in_index = 0 + + mult_code += f'{indent*2}}}}}\n' + + return mult_code + + def _add_backend_specific_pragmas_to_generated_code(self, code, backend): + if backend.name == 'Vivado': + weights_resource_pragma = '#pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM' + elif backend.name == 'Vitis': + weights_resource_pragma = '#pragma HLS BIND_STORAGE variable=weights type=ROM_NP impl=BRAM' + else: + raise Exception(f'Unexpected backend {backend.name} in GenerateUnrolledDenseResource optimizer.') + + code = code.format(weights_resource_pragma=weights_resource_pragma) + + return code diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 2112a8db04..117805dd86 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -31,6 +31,7 @@ from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType from hls4ml.report import parse_vivado_report +from hls4ml.utils import attribute_descriptions as descriptions class VivadoBackend(FPGABackend): @@ -49,10 +50,12 @@ def _register_layer_attributes(self): for layer in rnn_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1)) - attrs.append(ConfigurableAttribute('static', value_type=bool, default=True)) - attrs.append(ConfigurableAttribute('table_size', default=1024)) - 
attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8))) + attrs.append(ConfigurableAttribute('recurrent_reuse_factor', default=1, description=descriptions.reuse_factor)) + attrs.append( + ConfigurableAttribute('static', value_type=bool, default=True, description=descriptions.recurrent_static) + ) + attrs.append(ConfigurableAttribute('table_size', default=1024, description=descriptions.table_size)) + attrs.append(TypeAttribute('table', default=FixedPrecisionType(18, 8), description=descriptions.table_type)) self.attribute_map[layer] = attrs # Add ParallelizationFactor to Conv1D/2D @@ -63,16 +66,21 @@ def _register_layer_attributes(self): for layer in pf_layers: attrs = self.attribute_map.get(layer, []) - attrs.append(ConfigurableAttribute('parallelization_factor', default=1)) + attrs.append(ConfigurableAttribute('parallelization_factor', default=1, description=descriptions.conv_pf)) self.attribute_map[layer] = attrs # Add ConvImplementation to Convolution+Pooling layers cnn_layers = [Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, DepthwiseConv2D, Pooling1D, Pooling2D] - for layer in cnn_layers: attrs = self.attribute_map.get(layer, []) - # attrs.append(ConfigurableAttribute('conv_implementation', value_type=str, default='LineBuffer')) - attrs.append(ChoiceAttribute('conv_implementation', choices=['LineBuffer', 'Encoded'], default='LineBuffer')) + attrs.append( + ChoiceAttribute( + 'conv_implementation', + choices=['LineBuffer', 'Encoded'], + default='LineBuffer', + description=descriptions.conv_implementation, + ) + ) self.attribute_map[layer] = attrs def _register_flows(self): @@ -80,6 +88,7 @@ def _register_flows(self): init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) streaming_passes = [ + 'vivado:inplace_stream_flatten', # Inform downstream changed packsize in case of skipping flatten 'vivado:reshape_stream', 'vivado:clone_output', 'vivado:insert_zero_padding_before_conv1d', @@ -114,6 +123,9 @@ def _register_flows(self): 'vivado:generate_conv_streaming_instructions', 'vivado:apply_resource_strategy', 'vivado:generate_conv_im2col', + 'vivado:generate_pointwise_conv1_d', + 'vivado:generate_unrolled_dense_resource', + 'vivado:set_pipeline_style', ] vivado_types_flow = register_flow('specific_types', vivado_types, requires=[init_flow], backend=self.name) @@ -244,11 +256,6 @@ def build( return parse_vivado_report(model.config.get_output_dir()) - def _validate_conv_strategy(self, layer): - if layer.model.config.pipeline_style.lower() != 'dataflow': - print(f'WARNING: Layer {layer.name} requires "dataflow" pipeline style. Switching to "dataflow" pipeline style.') - layer.model.config.pipeline_style = 'dataflow' - @layer_optimizer(Layer) def init_base_layer(self, layer): reuse_factor = layer.model.config.get_reuse_factor(layer) @@ -270,6 +277,22 @@ def init_dense(self, layer): index_t = layer.get_weights('weight').type.index_precision else: layer.set_attr('strategy', 'resource') + elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled': + use_resource_instead = False + if layer.get_attr('reuse_factor', 1) == 1: + print( + f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}". ' + 'Using "resource" strategy instead.' 
+ ) + use_resource_instead = True + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + if use_resource_instead: + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + else: + self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False) + layer.set_attr('strategy', 'resource_unrolled') else: layer.set_attr('strategy', 'latency') layer.set_attr('index_t', NamedType(f'layer{layer.index}_index', index_t)) @@ -285,6 +308,28 @@ def init_conv1d(self, layer): n_in, n_out = self.get_layer_mult_size(layer) self.set_target_reuse_factor(layer) self.set_closest_reuse_factor(layer, n_in, n_out) + elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled': + use_resource_instead = False + if layer.get_attr('reuse_factor', 1) == 1: + print( + f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}".' + 'Using "resource" strategy instead.' + ) + use_resource_instead = True + elif layer.model.config.get_config_value('IOType') == 'io_parallel': + print( + f'Unrolled resource strategy cannot be combined with io_parallel in layer "{layer.name}". ' + 'Using "resource" strategy instead.' + ) + use_resource_instead = True + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + if use_resource_instead: + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + else: + self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False) + layer.set_attr('strategy', 'resource_unrolled') else: layer.set_attr('strategy', 'latency') @@ -315,8 +360,6 @@ def init_conv1d(self, layer): layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) - self._validate_conv_strategy(layer) - @layer_optimizer(SeparableConv1D) def init_sepconv1d(self, layer): if layer.model.config.is_resource_strategy(layer): @@ -386,6 +429,28 @@ def init_conv2d(self, layer): self.set_target_reuse_factor(layer) n_in, n_out = self.get_layer_mult_size(layer) self.set_closest_reuse_factor(layer, n_in, n_out) + elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled': + use_resource_instead = False + if layer.get_attr('reuse_factor', 1) == 1: + print( + f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}". ' + 'Using "resource" strategy instead.' + ) + use_resource_instead = True + elif layer.model.config.get_config_value('IOType') == 'io_parallel': + print( + f'Unrolled resource strategy cannot be combined with io_parallel in layer "{layer.name}". ' + 'Using "resource" strategy instead.' 
+ ) + use_resource_instead = True + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + if use_resource_instead: + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + else: + self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False) + layer.set_attr('strategy', 'resource_unrolled') else: layer.set_attr('strategy', 'latency') @@ -417,8 +482,6 @@ def init_conv2d(self, layer): layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) - self._validate_conv_strategy(layer) - @layer_optimizer(SeparableConv2D) def init_sepconv2d(self, layer): if layer.model.config.is_resource_strategy(layer): @@ -441,8 +504,8 @@ def init_sepconv2d(self, layer): ) else: closest_pf = chosen_pf - layer.set_attr('n_partitions', out_height * out_width // closest_pf) + layer.set_attr('n_partitions', out_height * out_width // closest_pf) layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower()) # Set the output type of the depthwise phase @@ -511,6 +574,25 @@ def init_lstm(self, layer): self.set_closest_reuse_factor(layer, n_in, n_out) self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') layer.set_attr('strategy', 'resource') + elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled': + use_resource_instead = False + if layer.get_attr('reuse_factor', 1) == 1: + print( + f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}". ' + 'Using "resource" strategy instead.' + ) + use_resource_instead = True + n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer) + if use_resource_instead: + self.set_closest_reuse_factor(layer, n_in, n_out) + self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') + layer.set_attr('strategy', 'resource') + else: + self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False) + self.set_closest_reuse_factor( + layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor', include_max_rf=False + ) + layer.set_attr('strategy', 'resource_unrolled') else: layer.set_attr('strategy', 'latency') @@ -526,6 +608,25 @@ def init_gru(self, layer): self.set_closest_reuse_factor(layer, n_in, n_out) self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor') layer.set_attr('strategy', 'resource') + elif layer.model.config.get_strategy(layer).lower() == 'resource_unrolled': + use_resource_instead = False + if layer.get_attr('reuse_factor', 1) == 1: + print( + f'Unrolled resource strategy cannot be combined with reuse factor 1 in layer "{layer.name}". ' + 'Using "resource" strategy instead.' 
+                )
+            use_resource_instead = True
+            n_in, n_out, n_in_recr, n_out_recr = self.get_layer_mult_size(layer)
+            if use_resource_instead:
+                self.set_closest_reuse_factor(layer, n_in, n_out)
+                self.set_closest_reuse_factor(layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor')
+                layer.set_attr('strategy', 'resource')
+            else:
+                self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
+                self.set_closest_reuse_factor(
+                    layer, n_in_recr, n_out_recr, attribute='recurrent_reuse_factor', include_max_rf=False
+                )
+                layer.set_attr('strategy', 'resource_unrolled')
         else:
             layer.set_attr('strategy', 'latency')

diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py
index 092e53b3d3..3d7ce1fe56 100644
--- a/hls4ml/converters/__init__.py
+++ b/hls4ml/converters/__init__.py
@@ -10,8 +10,7 @@
 from hls4ml.converters.keras_to_hls import get_supported_keras_layers  # noqa: F401
 from hls4ml.converters.keras_to_hls import parse_keras_model  # noqa: F401
 from hls4ml.converters.keras_to_hls import keras_to_hls, register_keras_layer_handler
-
-# from hls4ml.converters.pytorch_to_hls import parse_pytorch_model  # noqa: F401
+from hls4ml.converters.onnx_to_hls import parse_onnx_model  # noqa: F401
 from hls4ml.model import ModelGraph
 from hls4ml.utils.config import create_config
 from hls4ml.utils.symbolic_utils import LUTFunction
@@ -279,9 +278,10 @@ def convert_from_pytorch_model(
     Notes:
         Pytorch uses the "channels_first" data format for its tensors, while hls4ml expects the "channels_last" format
         used by keras. By default, hls4ml will automatically add layers to the model which transpose the inputs to the
-        "channels_last"format. Not that this is not supported for the "io_stream" io_type, for which the user will have
-        to transpose the input by hand before passing it to hls4ml. In that case the "inputs_channel_last" argument of
-        the "config_from_pytorch_model" function needs to be set to True. By default, the output of the model remains
+        "channels_last" format. Note that this is not supported for the "io_stream" io_type, for which the user will have
+        to transpose the input by hand before passing it to hls4ml. In that case the "channels_last_conversion" argument of
+        the "config_from_pytorch_model" function needs to be set to "internal". This argument can be used to completely
+        disable this internal conversion. By default, the output of the model remains
         in the "channels_last" data format. The "transpose_outputs" argument of the "config_from_pytorch_model" can be used
         to add a layer to the model that transposes back to "channels_first". As before, this will not work for
         io_stream.
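To make the conversion options above concrete, a minimal usage sketch follows. It is illustrative only: the exact signature of "config_from_pytorch_model" (how the input shape is passed and the accepted values of "channels_last_conversion"), as well as the keyword arguments of "convert_from_pytorch_model", are assumptions that should be checked against the installed hls4ml version.

```Python
# Hedged sketch of the io_stream case described above: hls4ml only converts the model
# internals to channels_last, and the input data is transposed by hand by the user.
# Argument names and accepted values are assumptions; adjust to the installed hls4ml API.
import torch.nn as nn

import hls4ml

model = nn.Sequential(nn.Conv2d(3, 8, kernel_size=3), nn.ReLU(), nn.Flatten(), nn.Linear(8 * 30 * 30, 10))

config = hls4ml.utils.config_from_pytorch_model(
    model,
    (3, 32, 32),  # channels-first input shape, assumed to exclude the batch dimension
    granularity='name',
    backend='Vivado',
    channels_last_conversion='internal',  # convert the model internally; inputs are transposed by hand
)

hls_model = hls4ml.converters.convert_from_pytorch_model(
    model,
    hls_config=config,
    output_dir='my-pytorch-prj',
    backend='Vivado',
    io_type='io_stream',
)

# Input arrays must then be supplied in channels_last order,
# e.g. x_channels_last = x.transpose(0, 2, 3, 1) for a NumPy array x.
```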
diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index 67798ae7b1..637bb6d401 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -67,7 +67,8 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): layer['class_name'] = 'ELU' # always use ELU type for elu, even if passed as activation if layer['class_name'] == 'LeakyReLU': - layer['activ_param'] = keras_layer['config'].get('alpha', 0.3) + # the name changes for version 3 + layer['activ_param'] = keras_layer['config'].get('negative_slope', keras_layer['config'].get('alpha', 0.3)) elif layer['class_name'] == 'ThresholdedReLU': layer['activ_param'] = keras_layer['config'].get('theta', 1.0) elif layer['class_name'] == 'ELU': @@ -83,6 +84,10 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): layer['class_name'] = 'HardActivation' if layer['class_name'] == 'Softmax': layer['axis'] = keras_layer['config'].get('axis', -1) + if layer['class_name'] == 'Activation' and layer['activation'] == 'leaky_relu': + layer['class_name'] = 'LeakyReLU' + # The parameter name changes for API v3; the default is different than in LeakyReLU layer + layer['activ_param'] = keras_layer['config'].get('negative_slope', keras_layer['config'].get('alpha', 0.2)) return layer, [shape for shape in input_shapes[0]] diff --git a/hls4ml/converters/keras/reshape.py b/hls4ml/converters/keras/reshape.py index bd9d519a2a..1f6dc2a759 100644 --- a/hls4ml/converters/keras/reshape.py +++ b/hls4ml/converters/keras/reshape.py @@ -11,8 +11,8 @@ def parse_flatten_layer(keras_layer, input_names, input_shapes, data_reader): layer = parse_default_keras_layer(keras_layer, input_names) layer['class_name'] = 'Reshape' - layer['target_shape'] = [input_shapes[0][0], np.prod(input_shapes[0][1:])] - output_shape = layer['target_shape'] + layer['target_shape'] = [np.prod(input_shapes[0][1:])] # target shape has no batch dimension + output_shape = input_shapes[0][:1] + layer['target_shape'] return layer, output_shape diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index f1150be15e..e31e2b96a9 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -297,26 +297,18 @@ def parse_keras_model(model_arch, reader): layer_list.append(layer) if 'activation' in layer and layer['class_name'] not in activation_layers + recurrent_layers: # + qkeras_layers: act_layer = {} + act_details = layer['activation'] # Workaround for QKeras activations passed as an argument - if isinstance(layer['activation'], dict): - act_details = layer['activation'] + if isinstance(act_details, dict): act_layer['class_name'] = 'QActivation' act_layer['config'] = { 'name': layer['name'] + '_' + act_details['class_name'], 'activation': act_details, } - act_layer, output_shape = layer_handlers['QActivation'](act_layer, None, [output_shape], reader) else: - act_layer['name'] = layer['name'] + '_' + layer['activation'] - act_layer['activation'] = layer['activation'] - if 'activ_param' in layer: - act_layer['activ_param'] = layer['activ_param'] - act_layer['class_name'] = layer['activation'] - elif layer['activation'] == 'softmax': - act_layer['class_name'] = 'Softmax' - act_layer['axis'] = -1 - else: - act_layer['class_name'] = 'Activation' + act_layer['class_name'] = 'Activation' + act_layer['config'] = {'name': layer['name'] + '_' + act_details, 'activation': act_details} + act_layer, output_shape = layer_handlers[act_layer['class_name']](act_layer, 
None, [output_shape], reader) inputs_map[layer['name']] = act_layer['name'] if output_layers is not None and layer['name'] in output_layers: output_layers = [act_layer['name'] if name == layer['name'] else name for name in output_layers] diff --git a/hls4ml/converters/onnx/convolution.py b/hls4ml/converters/onnx/convolution.py index 39b2232169..d84fb855a8 100644 --- a/hls4ml/converters/onnx/convolution.py +++ b/hls4ml/converters/onnx/convolution.py @@ -1,85 +1,77 @@ -from hls4ml.converters.onnx_to_hls import ( - compute_pads_1d, - compute_pads_2d, - get_onnx_attribute, - get_onnx_input_name, - onnx_handler, -) -from hls4ml.converters.utils import compute_padding_1d, compute_padding_2d +import numpy as np + +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler @onnx_handler('Conv') -def parse_conv_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_conv_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name - layer['data_format'] = 'channels_first' # ONNX's default is channel first - layer['inputs'] = get_onnx_input_name(node, graph) - reader.add_input(layer['name'], node.input) + if node.domain != 'qonnx.custom_op.channels_last': + raise RuntimeError("Please convert the model to channels-last format with qonnx-to-channels-last") + layer['data_format'] = 'channels_last' # QONNX needs to be channels-last. + layer['inputs'] = input_names + layer['outputs'] = node.output strides = get_onnx_attribute(node, 'strides') kernel_shape = get_onnx_attribute(node, 'kernel_shape') - - if len(input_shapes[0]) == 3: # Conv1D - layer['class_name'] = 'Conv1D' - - layer['in_width'] = input_shapes[0][2] - layer['n_chan'] = input_shapes[0][1] - layer['filt_width'] = kernel_shape[0] - layer['n_filt'] = reader.get_weights_data(layer['name'], 'kernel').shape[2] - layer['stride_width'] = strides[0] - pads = compute_pads_1d(node, layer) - + # Note: currently don't have support for auto_pad. 
+ pads = get_onnx_attribute(node, 'pads') + dilations = get_onnx_attribute(node, 'dilations') + if dilations is None: + dilations = [1] * len(layer['kernel_shape']) + + layer['in_width'] = input_shapes[0][-2] + layer['n_chan'] = input_shapes[0][-1] + layer['n_filt'] = input_shapes[1][0] + + layer['group'] = int(get_onnx_attribute(node, 'group')) + if layer['group'] != 1: + layer['depth_multiplier'] = get_onnx_attribute(node, 'group') / layer['n_chan'] + if not layer['depth_multiplier'].is_integer(): + raise ValueError('Depth multiplier must be an integer') + else: + layer['depth_multiplier'] = int(layer['depth_multiplier']) + + layer['n_dim'] = len(input_shapes[0]) - 2 # 2 comes from channels and batch dimentions + if layer['n_dim'] not in (1, 2): + raise ValueError("Only 1D and 2D convolutions are supported") + layer['class_name'] = 'Conv' + + # set some values needed later + if layer['n_dim'] == 1: + # this is 1D convolution + full_width = layer['in_width'] + pads[0] + pads[1] + eff_kernel_width = kernel_shape[0] * dilations[0] + layer['out_width'] = int(np.ceil((full_width - eff_kernel_width + 1) / strides[0])) + # for compatibility interpret some variables layer['pad_left'] = pads[0] layer['pad_right'] = pads[1] - - if all(x == 0 for x in pads): # No padding, i.e., 'VALID' padding - layer['padding'] = 'valid' - else: - layer['padding'] = 'same' - - (layer['out_width'], _, _) = compute_padding_1d( - layer['padding'], layer['in_width'], layer['stride_width'], layer['filt_width'] - ) - - output_shape = [input_shapes[0][0], layer['n_filt'], layer['out_width']] - - elif len(input_shapes[0]) == 4: # Conv2D - layer['class_name'] = 'Conv2D' - - layer['in_height'] = input_shapes[0][2] - layer['in_width'] = input_shapes[0][3] - layer['n_chan'] = input_shapes[0][1] - + layer['filt_width'] = kernel_shape[0] + layer['stride_width'] = strides[0] + layer['dilation_width'] = dilations[0] + else: + # 2d + layer['in_height'] = input_shapes[0][-3] + full_height = layer['in_height'] + pads[0] + pads[2] + eff_kernel_height = kernel_shape[0] * dilations[0] + out_height = int(np.ceil((full_height - eff_kernel_height + 1) / strides[0])) + layer['out_height'] = out_height + + full_width = input_shapes[0][-2] + pads[1] + pads[3] + eff_kernel_width = kernel_shape[1] * dilations[1] + out_width = int(np.ceil((full_width - eff_kernel_width + 1) / strides[1])) + layer['out_width'] = out_width + # for compatibility interpret some variables + layer['pad_top'] = pads[0] + layer['pad_left'] = pads[1] + layer['pad_bottom'] = pads[2] + layer['pad_right'] = pads[3] layer['filt_height'] = kernel_shape[0] layer['filt_width'] = kernel_shape[1] - - layer['n_filt'] = next( - (x.type.tensor_type.shape.dim[1].dim_value for x in graph.value_info if x.name == node.output[0]), None - ) layer['stride_height'] = strides[0] layer['stride_width'] = strides[1] - pads = compute_pads_2d(node, layer) - - layer['pad_top'] = pads[0] - layer['pad_bottom'] = pads[2] - layer['pad_left'] = pads[1] - layer['pad_right'] = pads[3] - - if all(x == 0 for x in pads): # No padding, i.e., 'VALID' padding in Keras/Tensorflow - layer['padding'] = 'valid' - else: # Only 'valid' and 'same' padding are available in Keras - layer['padding'] = 'same' - - (layer['out_height'], layer['out_width'], _, _, _, _) = compute_padding_2d( - layer['padding'], - layer['in_height'], - layer['in_width'], - layer['stride_height'], - layer['stride_width'], - layer['filt_height'], - layer['filt_width'], - ) - - output_shape = [input_shapes[0][0], layer['n_filt'], 
layer['out_height'], layer['out_width']] + layer['dilation_height'] = dilations[0] + layer['dilation_width'] = dilations[1] - return layer, output_shape + return layer diff --git a/hls4ml/converters/onnx/core.py b/hls4ml/converters/onnx/core.py index 940b860870..8ad851426d 100644 --- a/hls4ml/converters/onnx/core.py +++ b/hls4ml/converters/onnx/core.py @@ -1,28 +1,20 @@ -from hls4ml.converters.onnx_to_hls import get_onnx_attribute, get_onnx_input_name, onnx_handler +import numpy as np +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler -@onnx_handler(*['Gemm', 'MatMul']) -def parse_gemm_layer(reader, node, inputs_map, input_shapes, graph, config): + +@onnx_handler('MatMul') +def parse_matmul_layer(node, input_names, input_shapes, graph): layer = {} - layer['class_name'] = 'Dense' + layer['class_name'] = 'MatMul' layer['name'] = node.name - layer['inputs'] = get_onnx_input_name(node, graph) - - tran_weight = get_onnx_attribute(node, 'transB', 0) - reader.add_input(layer['name'], node.input, tran_weight) - - weights_shape = reader.get_weights_data(layer['name'], 'kernel').shape - layer['n_in'] = weights_shape[0] - layer['n_out'] = weights_shape[1] - - output_shape = input_shapes[0][:] - output_shape[-1] = layer['n_out'] + layer['inputs'] = input_names + layer['outputs'] = list(node.output) - return layer, output_shape + return layer -# ------------------Global paras for activations # TODO: repair HardSigmoid support # https://github.com/fastmachinelearning/hls4ml/issues/409 activation_layers = [ @@ -37,7 +29,6 @@ def parse_gemm_layer(reader, node, inputs_map, input_shapes, graph, config): 'Softmax', 'Softsign', 'Softplus', - 'Clip', ] activation_map = { @@ -53,70 +44,79 @@ def parse_gemm_layer(reader, node, inputs_map, input_shapes, graph, config): 'Softmax': 'Softmax', 'Softsign': 'Activation', 'Softplus': 'Activation', - 'Clip': 'Clip', } # --------- @onnx_handler(*activation_layers) -def parse_activation_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_activation_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name layer['class_name'] = activation_map[node.op_type] layer['activation'] = node.op_type.lower() - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) if layer['class_name'] != 'Activation': if layer['class_name'] == 'Softmax': layer['activation'] = 'softmax' + layer['axis'] = get_onnx_attribute(node, 'axis', -1) + # because -1 is better supported than an explicit index, check if it's the same + if layer['axis'] == len(input_shapes[0]) - 1: + layer['axis'] = -1 elif layer['class_name'] in ['ELU', 'LeakyReLU', 'ThresholdedReLU']: layer['activation'] = layer['class_name'] layer['activ_param'] = get_onnx_attribute(node, 'alpha', 0.01) - elif layer['class_name'] == 'Clip': - clip_min_node = [x for x in graph.initializer if x.name in node.input] - clip_min = clip_min_node[0].float_data[0] - - # Check if it's relu or not - if clip_min == 0.0: - layer['class_name'] = 'Activation' - layer['activation'] = 'ReLU' - else: - raise Exception('Clip with min != 0 is not supported yet!') - else: layer['activation'] = layer['class_name'] layer['class_name'] = 'Activation' - return layer, [shape for shape in input_shapes[0]] + return layer @onnx_handler('BatchNormalization') -def parse_batchnorm_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_batchnorm_layer(node, input_names, input_shapes, graph): layer = {} - 
layer['class_name'] = 'BatchNormalization' - layer['data_format'] = 'channels_first' + layer['class_name'] = 'BatchNormOnnx' layer['name'] = node.name - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) # Other attributes - layer['epsilon'] = get_onnx_attribute(node, 'epsilon') - layer['momentum'] = get_onnx_attribute(node, 'momentum') - - reader.add_input(layer['name'], node.input) - - in_size = 1 - for dim in input_shapes[0][1:]: - in_size *= dim + layer['epsilon'] = get_onnx_attribute(node, 'epsilon', 1e-05) + # layer['momentum'] = get_onnx_attribute(node, 'momentum', 0.9) # not used - layer['n_in'] = layer['n_out'] = in_size + layer['n_in'] = layer['n_out'] = np.prod(input_shapes[0][1:]) if len(input_shapes[0]) == 2: layer['n_filt'] = -1 elif len(input_shapes[0]) > 2: - layer['n_filt'] = input_shapes[0][1] # Always channel first for onnx + if node.domain != 'qonnx.custom_op.channels_last': + raise RuntimeError("Please convert the model to channels-last format with qonnx-to-channels-last") + layer['data_format'] = 'channels_last' # QONNX needs to be channels-last. + layer['n_filt'] = input_shapes[0][-1] + else: + raise RuntimeError(f"Unexpected input shape: {input_shapes[0]}") + + return layer + + +@onnx_handler('Quant') +def parse_quant_layer(node, input_names, input_shapes, graph): + layer = {} + + layer['class_name'] = 'Quant' + layer['name'] = node.name + layer['inputs'] = input_names + layer['outputs'] = list(node.output) + + # Other attributes + layer['narrow'] = bool(get_onnx_attribute(node, 'narrow')) + layer['rounding_mode'] = get_onnx_attribute(node, 'rounding_mode') + layer['signed'] = bool(get_onnx_attribute(node, 'signed')) - return layer, [shape for shape in input_shapes[0]] + return layer diff --git a/hls4ml/converters/onnx/merge.py b/hls4ml/converters/onnx/merge.py index 9ccd432d18..420f077ec2 100644 --- a/hls4ml/converters/onnx/merge.py +++ b/hls4ml/converters/onnx/merge.py @@ -1,16 +1,28 @@ -from hls4ml.converters.onnx_to_hls import get_onnx_attribute, get_onnx_input_name, onnx_handler +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler -merge_layers = ['Add', 'Sub', 'Mul', 'Average', 'Max', 'Min', 'Concat', 'Sum'] +merge_layers = ['Add', 'Sub', 'Mul', 'Div', 'Average', 'Max', 'Min', 'Concat', 'Sum'] + +op_map = { + 'Add': 'add', + 'Sub': 'subtract', + 'Mul': 'multiply', + 'Div': 'divide', + 'Average': 'average', + 'Max': 'maximum', + 'Min': 'minimum', + 'Sum': 'add', + 'Concat': 'concat', +} @onnx_handler(*merge_layers) -def parse_merge_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_merge_layer(node, input_names, input_shapes, graph): layer = {} layer['class_name'] = node.op_type layer['name'] = node.name - layer['op'] = layer['class_name'].lower() - layer['inputs'] = get_onnx_input_name(node, graph) - output_shape = input_shapes[0] + layer['op'] = op_map[node.op_type] + layer['inputs'] = input_names + layer['outputs'] = list(node.output) if layer['class_name'] == 'Concat': rank = len(input_shapes[0][1:]) @@ -21,22 +33,10 @@ def parse_merge_layer(reader, node, inputs_map, input_shapes, graph, config): layer['op'] = layer['class_name'].lower() + f'{rank}d' layer['axis'] = get_onnx_attribute(node, 'axis') - # Calculate output shape - new_dim = sum( - [x.type.tensor_type.shape.dim[layer['axis']].dim_value for x in graph.value_info if x.name in node.input] - ) - output_shape[layer['axis']] = new_dim - - elif layer['class_name'] == 'Add': - # Check if 
the layer is an AddBias - for input in node.input: - if "bias" in input: - layer['class_name'] = 'BiasAdd' - reader.add_input(layer['name'], node.input) else: layer['class_name'] = 'Merge' if len(layer['inputs']) > 2: raise Exception('ERROR: Merging more than two tensors is not yet supported.') - return layer, output_shape + return layer diff --git a/hls4ml/converters/onnx/pooling.py b/hls4ml/converters/onnx/pooling.py index 67fa76c7c7..1f5c431004 100644 --- a/hls4ml/converters/onnx/pooling.py +++ b/hls4ml/converters/onnx/pooling.py @@ -1,26 +1,30 @@ -from hls4ml.converters.onnx_to_hls import ( - compute_pads_1d, - compute_pads_2d, - get_onnx_attribute, - get_onnx_input_name, - onnx_handler, -) -from hls4ml.converters.utils import compute_padding_1d, compute_padding_2d +import numpy as np + +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler pool_operations = ['AveragePool', 'MaxPool'] @onnx_handler(*pool_operations) -def parse_pool_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_pool_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) + if node.domain != 'qonnx.custom_op.channels_last': + raise RuntimeError("Please convert the model to channels-last format with qonnx-to-channels-last") layer['class_name'] = node.op_type - layer['data_format'] = 'channels_first' # Default ONNX + layer['data_format'] = 'channels_last' # Default QONNX info = layer['class_name'].replace('Pool', '') strides = get_onnx_attribute(node, 'strides') kernel_shape = get_onnx_attribute(node, 'kernel_shape') + pads = get_onnx_attribute(node, 'pads') + layer['pads'] = pads + dilations = get_onnx_attribute(node, 'dilations') + if dilations is None: + dilations = [1] * len(kernel_shape) + layer['dilations'] = dilations if len(input_shapes[0]) == 3: # 1D layer['class_name'] = info + 'Pooling1D' @@ -31,70 +35,50 @@ def parse_pool_layer(reader, node, inputs_map, input_shapes, graph, config): layer['pool_width'] = kernel_shape[0] layer['stride_width'] = strides[0] - # Padding - pads = compute_pads_1d(node, layer) - layer['pad_left'] = pads[0] - layer['pad_right'] = pads[1] - - if all(x == 0 for x in pads): # No padding, i.e., 'VALID' padding - layer['padding'] = 'valid' - else: - layer['padding'] = 'same' - - (layer['n_out'], _, _) = compute_padding_1d( - layer['padding'], layer['n_in'], layer['stride_width'], layer['pool_width'] + # formula from ONNX Operators.md documentation + layer['n_out'] = int( + np.floor((layer['n_in'] + np.sum(pads) - ((kernel_shape[0] - 1) * dilations[0] + 1)) / strides[0] + 1) ) - output_shape = [input_shapes[0][0], layer['n_filt'], layer['n_out']] - elif len(input_shapes[0]) == 4: # 2D layer['class_name'] = info + 'Pooling2D' - layer['n_filt'] = input_shapes[0][1] - layer['in_height'] = input_shapes[0][2] - layer['in_width'] = input_shapes[0][3] + layer['n_filt'] = input_shapes[0][3] + layer['in_height'] = input_shapes[0][1] + layer['in_width'] = input_shapes[0][2] layer['stride_height'] = strides[0] layer['stride_width'] = strides[1] layer['pool_height'] = layer['filt_height'] = kernel_shape[0] layer['pool_width'] = layer['filt_width'] = kernel_shape[1] - pads = compute_pads_2d(node, layer) layer['pad_top'] = pads[0] layer['pad_bottom'] = pads[2] layer['pad_left'] = pads[1] layer['pad_right'] = pads[3] - if all(x == 0 for x in pads): # No padding, i.e., 'VALID' padding in Keras/Tensorflow - 
layer['padding'] = 'valid' - else: # Only 'valid' and 'same' padding are available in Keras - layer['padding'] = 'same' - - (layer['out_height'], layer['out_width'], _, _, _, _) = compute_padding_2d( - layer['padding'], - layer['in_height'], - layer['in_width'], - layer['stride_height'], - layer['stride_width'], - layer['filt_height'], - layer['filt_width'], + # formula from ONNX Operators.md documentation + layer['out_height'] = int( + np.floor((layer['in_height'] + pads[0] + pads[2] - ((kernel_shape[0] - 1) * dilations[0] + 1)) / strides[0] + 1) + ) + layer['out_width'] = int( + np.floor((layer['in_width'] + pads[1] + pads[3] - ((kernel_shape[1] - 1) * dilations[1] + 1)) / strides[1] + 1) ) - output_shape = [input_shapes[0][0], layer['n_filt'], layer['out_height'], layer['out_width']] - - return layer, output_shape + return layer global_pooling_layers = ['GlobalMaxPool', 'GlobalAveragePool'] @onnx_handler(*global_pooling_layers) -def parse_global_pooling_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_global_pooling_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) layer['class_name'] = node.op_type - layer['data_format'] = 'channels_first' + layer['data_format'] = 'channels_last' # default QONNX # Sonme default parameters for global pooling layer['n_out'] = 1 @@ -116,6 +100,4 @@ def parse_global_pooling_layer(reader, node, inputs_map, input_shapes, graph, co layer['in_height'] = input_shapes[0][2] layer['in_width'] = input_shapes[0][3] - output_shape = [input_shapes[0][0], layer['n_filt']] + [1] * (len(input_shapes[0]) - 2) - - return layer, output_shape + return layer diff --git a/hls4ml/converters/onnx/reshape.py b/hls4ml/converters/onnx/reshape.py index 5bbf58b079..f11796b6db 100644 --- a/hls4ml/converters/onnx/reshape.py +++ b/hls4ml/converters/onnx/reshape.py @@ -1,39 +1,60 @@ -import numpy as np - -from hls4ml.converters.onnx_to_hls import get_onnx_input_name, onnx_handler +from hls4ml.converters.onnx_to_hls import get_onnx_attribute, onnx_handler @onnx_handler('Transpose') -def parse_transpose_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_transpose_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name layer['class_name'] = 'Transpose' - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) perm = [list(i.ints) for i in node.attribute][0] # This will get something like [[a,b,c]][0] = [a,b,c] layer['perm'] = [x - 1 for x in perm[1:]] # Ignore the batch dimension in ONNX, and adjust the perm indexing - output_shape = [input_shapes[0][i] for i in perm] - - return layer, output_shape + return layer @onnx_handler('Reshape') -def parse_reshape_layer(reader, node, inputs_map, input_shapes, graph, config): +def parse_reshape_layer(node, input_names, input_shapes, graph): layer = {} layer['name'] = node.name layer['class_name'] = 'Reshape' - layer['inputs'] = get_onnx_input_name(node, graph) + layer['inputs'] = input_names + layer['outputs'] = list(node.output) - target_shape = list([x for x in graph.initializer if x.name == node.input[1]][0].int64_data)[1:] + return layer - if -1 in target_shape: # Need to infer shape for -1 - print("WARNING: Inferring -1 shape ... 
") - dummy_x = np.ones(input_shapes[0][1:]) - dummy_y = np.reshape(dummy_x, target_shape) - target_shape = list(dummy_y.shape) - layer['target_shape'] = target_shape - output_shape = input_shapes[0][:1] + layer['target_shape'] +@onnx_handler('Flatten') +def parse_flatten_layer(node, input_names, input_shapes, graph): + layer = {} + layer['name'] = node.name + layer['class_name'] = 'Reshape' + layer['inputs'] = input_names + layer['outputs'] = list(node.output) + layer['target_shape'] = [-1] # does not contain batch dimension - return layer, output_shape + return layer + + +@onnx_handler('Resize') +def parse_resize_layer(node, input_names, input_shapes, graph): + layer = {} + layer['name'] = node.name + layer['class_name'] = 'Resize' + layer['inputs'] = input_names + layer['outputs'] = list(node.output) + layer['in_height'] = input_shapes[0][2] + layer['in_width'] = input_shapes[0][1] + layer['out_width'] = input_shapes[0][1] + layer['out_height'] = input_shapes[0][2] + layer['n_chan'] = input_shapes[0][3] + layer['algorithm'] = get_onnx_attribute(node, 'mode') + # The following is used in initialize() method. + # Probably a better solution would be to have a channels last parameter at QONNX level + layer['data_format'] = ( + 'channels_last' if any(node.domain == 'qonnx.custom_op.channels_last' for node in graph.node) else 'channels_first' + ) + + return layer diff --git a/hls4ml/converters/onnx_to_hls.py b/hls4ml/converters/onnx_to_hls.py index 106daf62da..75850fa93e 100644 --- a/hls4ml/converters/onnx_to_hls.py +++ b/hls4ml/converters/onnx_to_hls.py @@ -1,78 +1,10 @@ -import numpy as np import onnx -from onnx import helper, numpy_helper, shape_inference +from onnx import helper, numpy_helper from hls4ml.model import ModelGraph -MAXMULT = 4096 - -class ONNXDataReader: - """ - ONNX data reader to be used for extracting relevant information during conversion. - """ - - def __init__(self, model): - self.model = model - self.input_map = {} - self.index_map = { - # Dense - 'kernel': 1, - 'bias': 2, - # BatchNormalization - 'gamma': 1, - 'beta': 2, - 'moving_mean': 3, - 'moving_variance': 4, - } - - def get_weights_data(self, layer_name, var_name): - """Extract weights data from ONNX model. - - Args: - layer_name (str): Layer's name in the ONNX model. - var_name (str): Variable to be extracted. - - Returns: - ndarray: Extracted weights data. 
- """ - # Get the node associated with the layer name - node = next(node for node in self.model.graph.node if node.name == layer_name) - - inputs = self.input_map[layer_name] - inp_idx = self.index_map[var_name] - - if inp_idx >= len(inputs['inputs']): - # Check if the layer is an AddBias layer - if (node.op_type == 'Add') and (var_name == 'bias'): - inp_idx = 1 - else: - # Input not found, likely a bias tensor is not available - return None - - tensor = next((x for x in self.model.graph.initializer if x.name == inputs['inputs'][inp_idx]), None) - - if tensor is not None: - data = numpy_helper.to_array(tensor) - - if inputs['transpose']: - if inputs['perm'] is not None and len(data.shape) == len(inputs['perm']): - data = data.transpose(inputs['perm']) - else: - data = data.transpose() - - # Check for transB in Gemm - if node.op_type == 'Gemm': - if not get_onnx_attribute(node, 'transB'): - data = data.transpose() - - return data - - def add_input(self, layer_name, inputs, transpose=True, perm=None): - self.input_map[layer_name] = {'inputs': inputs, 'transpose': transpose, 'perm': perm} - - -# ----------------------Helpers--------------------- # +# ----------------------Helpers--------------------- def sanitize_layer_name(layer): new_name = layer['name'] if new_name[0].isdigit(): @@ -99,9 +31,52 @@ def get_onnx_attribute(operation, name, default=None): return value -def get_input_shape(model, operation, input_idx=0): - value_info_idx = next((i for i, x in enumerate(model.graph.value_info) if x.name == operation.input[input_idx]), 0) - return [d.dim_value for d in model.graph.value_info[value_info_idx].type.tensor_type.shape.dim] +def get_global_input_shape(graph, inp): + """Return the global input shape of the graph with name inp + + Arguments: + graph: the onnx graph + inp (str): the global input name + + Returns: + list: The shape + + Raises: + StopIteration: If the global input name is not found + """ + inp_shape = next(x.type.tensor_type.shape.dim for x in graph.input if x.name == inp) + return list(x.dim_value for x in inp_shape) + + +def get_input_shape(graph, node): + """Return the input shapes of the node in the model + + Arguments: + graph: the onnx graph + node: the onnx node for which the input is desired + + Returns: + list of lists: The shapes of all the inputs + + Raises: + StopIteration: If the an input name is not found in the graph + """ + rv = [] + for inp in node.input: + try: + value_info_idx = next((i for i, x in enumerate(graph.value_info) if x.name == inp)) + dim = list(d.dim_value for d in graph.value_info[value_info_idx].type.tensor_type.shape.dim) + except StopIteration: + # The input is not in the graph, likely it's the input + dim = get_global_input_shape(graph, inp) + if dim: + rv.append(dim) + return rv + + +def get_constant_value(graph, constant_name): + tensor = next((x for x in graph.initializer if x.name == constant_name), None) + return numpy_helper.to_array(tensor) def compute_pads_1d(operation, layer): @@ -155,7 +130,7 @@ def compute_pads_2d(operation, layer): return pads -# ----------------------Layer handling--------------------- # +# ----------------------Layer handling--------------------- layer_handlers = {} @@ -178,27 +153,6 @@ def decorator(function): return decorator -# --->> A set of functions to address the naming convetion in ONNx's graph -def get_onnx_input_name(node, graph): - """ - In ONNX, when calling node.input, it returns the node input's index in the graph instead of the input's name. 
- However, the input's name is used for indexing in ModelGraph's graph. This function return the input node's name instead. - """ - - in_node = [in_node for in_node in graph.node if (in_node.output[0] in node.input)] - - if in_node: - if in_node[0].op_type != 'Flatten': - input_node_name = [x.name for x in in_node] - else: # IF it's a flatten - input_node_name = [x.name for x in graph.node if (x.output[0] in in_node[0].input)] - - return input_node_name - - else: # If there is no input name it's actually the first layer - return [replace_char_inconsitency(node.input[0])] - - def get_out_layer_name(graph): """ Get the output layer's name for the model. @@ -208,36 +162,31 @@ def get_out_layer_name(graph): return [node.name for node in graph.node if node.output[0] in output_index_list] -def onnx_to_hls(config): - """Convert onnx model to hls model from configuration. +def parse_onnx_model(onnx_model): + """Parses the onnx model, both for configuration building and general processing. Args: - config (dict): ONNX configuration from yaml file or passed through API. + onnx_model: an ONNX model object. Raises: Exception: Raised if an unsupported operation is found in the ONNX model. Returns: - ModelGraph: hls4ml model object + layer_list (list): The onnx layers + input_layers (list): The input layers + output_layers (list): The output layers """ # This is a list of dictionaries to hold all the layer info we need to generate HLS layer_list = [] - # Extract model architecture - print('Interpreting Model ...') - - model = onnx.load(config['OnnxModel']) if isinstance(config['OnnxModel'], str) else config['OnnxModel'] - - model = shape_inference.infer_shapes(model) - graph = model.graph - - reader = ONNXDataReader(model) + # We don't infer the shapes because the qonnx package preprocessing does it. # Obtain list of input/ouput layers - all_inputs = [x.name for x in model.graph.input] - all_initializers = [x.name for x in model.graph.initializer] + all_inputs = [x.name for x in onnx_model.graph.input] + all_initializers = [x.name for x in onnx_model.graph.initializer] input_layers = [x for x in all_inputs if x not in all_initializers] - output_layers = get_out_layer_name(graph) + constant_layers = all_initializers # no need to copy it even though we change it + output_layers = get_out_layer_name(onnx_model.graph) print("Output layers: ", output_layers) @@ -245,75 +194,93 @@ def onnx_to_hls(config): input_layer = {} input_layer['name'] = replace_char_inconsitency(inp) input_layer['class_name'] = 'InputLayer' - inp_shape = next((x.type.tensor_type.shape.dim for x in model.graph.input if x.name == inp), None) - input_layer['input_shape'] = [x.dim_value for x in inp_shape] - - if len(input_layer['input_shape']) > 1: - input_layer['input_shape'][0] = None # Firt dim is batch + inp_shape = get_global_input_shape(onnx_model.graph, inp) + # We only support ONNX where the first dimension is the batch dimension. 
+ # Remove the batch dimension in all subsequnt use + input_layer['input_shape'] = inp_shape[1:] + print('Input shape:', input_layer['input_shape']) # Clean the layer name for specific models sanitize_layer_name(input_layer) input_layers[i] = input_layer['name'] layer_list.append(input_layer) + for i, constant in enumerate(constant_layers): + constant_layer = {} + constant_layer['name'] = replace_char_inconsitency(constant) + constant_layer['class_name'] = 'Constant' + constant_layer['value'] = get_constant_value(onnx_model.graph, constant) + + # Clean the layer name for specific models + sanitize_layer_name(constant_layer) + constant_layers[i] = constant_layer['name'] + + layer_list.append(constant_layer) + # Defined supported layers and check for unsupported layer type - skip_layers = ['Dropout', 'Identity', 'Flatten'] + skip_layers = ['Dropout', 'Identity'] # Map inputs of skipped layers inputs_map = {} supported_layers = get_supported_onnx_layers() + skip_layers - # Get input shape - current_shape = [input_layer['input_shape']] - print('Input shape:', current_shape[0]) - - # Loop through layers - layer_counter = 0 - - # Output shape tracking - output_shape = None - print('Topology:') - for node in graph.node: + for node in onnx_model.graph.node: if node.op_type not in supported_layers: raise Exception(f'ERROR: Unsupported operation type: {node.op_type}') - # If not the first layer then input shape is taken from last layer's output - if layer_counter != 0: - current_shape = [output_shape] + # Note that at this point, input shape still contains batch dimension + # in cases where it appears. That is not filtered out till later. + input_shapes = get_input_shape(onnx_model.graph, node) if node.op_type in skip_layers: - if node.op_type == 'Flatten': - output_shape = [current_shape[0][0], np.prod(current_shape[0][1:])] - - else: - # Currently supported skipped layers have only one input and output - # Skipped layers can follow each other (e.g., Dropout -> Flatten) - - # Mapping inputs - input_name = inputs_map.get(node.input[0], node.input[0]) - output_name = node.output[0] - inputs_map[output_name] = input_name + # Currently supported skipped layers have only one input and output + # Skipped layers can follow each other - output_shape = current_shape[0] + # Mapping inputs + input_name = inputs_map.get(node.input[0], node.input[0]) + output_name = node.output[0] + inputs_map[output_name] = input_name continue - if node.op_type in supported_layers: - layer_counter = layer_counter + 1 + input_names = [inputs_map.get(x, x) for x in node.input] # Process the layer - layer, output_shape = layer_handlers[node.op_type](reader, node, inputs_map, current_shape, graph, config) + layer = layer_handlers[node.op_type](node, input_names, input_shapes, onnx_model.graph) sanitize_layer_name(layer) - print('Layer name: {}, layer type: {}, current shape: {}'.format(layer['name'], layer['class_name'], current_shape)) + print(f"Layer name: {layer['name']}, layer type: {layer['class_name']}, current shape: {input_shapes}") layer_list.append(layer) + return layer_list, input_layers, output_layers + + +def onnx_to_hls(config): + """Convert onnx model to hls model from configuration. + + Args: + config (dict): ONNX configuration from yaml file or passed through API. + + Raises: + Exception: Raised if an unsupported operation is found in the ONNX model. 
+ + Returns: + ModelGraph: hls4ml model object + """ + + # Extract model architecture + print('Interpreting Model ...') + + onnx_model = onnx.load(config['OnnxModel']) if isinstance(config['OnnxModel'], str) else config['OnnxModel'] + + layer_list, input_layers, output_layers = parse_onnx_model(onnx_model) + ################# # Generate HLS ################# print('Creating HLS model') - hls_model = ModelGraph(config, reader, layer_list, input_layers, output_layers) + hls_model = ModelGraph(config, layer_list, input_layers, output_layers) return hls_model diff --git a/hls4ml/converters/pytorch/core.py b/hls4ml/converters/pytorch/core.py index c56857715a..2c05b7501f 100644 --- a/hls4ml/converters/pytorch/core.py +++ b/hls4ml/converters/pytorch/core.py @@ -62,9 +62,13 @@ def parse_activation_layer(operation, layer_name, input_names, input_shapes, nod layer['activation'] = 'ThresholdedReLU' if layer['activ_param'] < 0: raise Exception('negative threshold values not supported') - - if hasattr(node, 'dim'): + if hasattr(class_object, 'dim'): layer['axis'] = class_object.dim + if layer['class_name'] == 'Softmax' and layer['axis'] is None: + layer['axis'] = -1 + if 'IOType' in config: + if layer['class_name'] == 'Softmax' and config['IOType'] == 'io_stream' and layer['axis'] != -1: + raise Exception('dim needs to be -1 for io_stream') else: if layer['class_name'] in ['ReLU', 'Sigmoid', 'Tanh']: layer['class_name'] = 'Activation' @@ -80,6 +84,11 @@ def parse_activation_layer(operation, layer_name, input_names, input_shapes, nod layer['activation'] = 'ThresholdedReLU' if 'dim' in node.kwargs: layer['axis'] = node.kwargs['dim'] + if layer['class_name'] == 'Softmax' and layer['axis'] is None: + layer['axis'] = -1 + if 'IOType' in config: + if layer['class_name'] == 'Softmax' and config['IOType'] == 'io_stream' and layer['axis'] != -1: + raise Exception('dim needs to be -1 for io_stream') output_shape = input_shapes[0] return layer, output_shape diff --git a/hls4ml/converters/pytorch/pooling.py b/hls4ml/converters/pytorch/pooling.py index 8256a9ff87..3757b2c82e 100644 --- a/hls4ml/converters/pytorch/pooling.py +++ b/hls4ml/converters/pytorch/pooling.py @@ -90,15 +90,19 @@ def parse_pooling_layer(operation, layer_name, input_names, input_shapes, node, layer['stride_height'] = node.kwargs['stride'][0] layer['stride_width'] = node.kwargs['stride'][1] else: - layer['stride_height'] = node.kwargs['stride'] - layer['stride_width'] = node.kwargs['stride'] - if type(node.kwargs['kernel_size']) is tuple: - layer['pool_height'] = node.kwargs['kernel_size'][0] - layer['pool_width'] = node.kwargs['kernel_size'][1] + if node.kwargs['stride'] is None: + # if stride is not set it is supposed to default to the kernel size + layer['stride_height'] = node.args[1] + layer['stride_width'] = node.args[1] + else: + layer['stride_height'] = node.kwargs['stride'] + layer['stride_width'] = node.kwargs['stride'] + if type(node.args[1]) is tuple: + layer['pool_height'] = node.args[1][0] + layer['pool_width'] = node.args[1][1] else: - layer['pool_height'] = node.kwargs['kernel_size'] - layer['pool_width'] = node.kwargs['kernel_size'] - + layer['pool_height'] = node.args[1] + layer['pool_width'] = node.args[1] if type(node.kwargs['padding']) is tuple: padding = node.kwargs['padding'] else: diff --git a/hls4ml/converters/pytorch/reshape.py b/hls4ml/converters/pytorch/reshape.py index 37191135a1..3d415e7832 100644 --- a/hls4ml/converters/pytorch/reshape.py +++ b/hls4ml/converters/pytorch/reshape.py @@ -93,13 +93,23 @@ def 
parse_flatten_layer(operation, layer_name, input_names, input_shapes, node, layer['class_name'] = 'Reshape' layer['name'] = layer_name layer['inputs'] = input_names - - start_dim = class_object.start_dim - end_dim = class_object.end_dim - if end_dim + 1 == 0 or end_dim + 1 > len(input_shapes[0]): - end_dim = len(input_shapes[0]) + if node.op == 'call_module': + start_dim = class_object.start_dim + end_dim = class_object.end_dim + if end_dim + 1 == 0 or end_dim + 1 > len(input_shapes[0]): + end_dim = len(input_shapes[0]) + else: + end_dim = end_dim + 1 else: - end_dim = end_dim + 1 + start_dim = node.args[1] + if len(node.args) == 3: + end_dim = node.args[2] + else: + end_dim = -1 + if end_dim + 1 == 0 or end_dim + 1 > len(input_shapes[0]): + end_dim = len(input_shapes[0]) + else: + end_dim = end_dim + 1 layer['target_shape'] = ( input_shapes[0][0:start_dim] + [np.prod(input_shapes[0][start_dim:end_dim])] + input_shapes[0][end_dim:] diff --git a/hls4ml/converters/utils.py b/hls4ml/converters/utils.py index d1c9e050d5..f365916b55 100644 --- a/hls4ml/converters/utils.py +++ b/hls4ml/converters/utils.py @@ -45,7 +45,7 @@ def compute_padding_1d(pad_type, in_size, stride, filt_size): is odd, it will add the extra column to the right. Args: - pad_type (str): Padding type, one of ``same``, `valid`` or ``causal`` (case insensitive). + pad_type (str): Padding type, one of ``same``, ``valid`` or ``causal`` (case insensitive). in_size (int): Input size. stride (int): Stride length. filt_size (int): Length of the kernel window. @@ -135,6 +135,23 @@ def compute_padding_2d(pad_type, in_height, in_width, stride_height, stride_widt def compute_padding_1d_pytorch(pad_type, in_size, stride, filt_size, dilation): + """Computes the amount of padding required on each side of the 1D input tensor following pytorch conventions. + + In case of ``same`` padding, this routine tries to pad evenly left and right, but if the amount of columns to be added + is odd, it will add the extra column to the right. + + Args: + pad_type (str or int): Padding type. If string, one of ``same``, ``valid`` or ``causal`` (case insensitive). + in_size (int): Input size. + stride (int): Stride length. + filt_size (int): Length of the kernel window. + + Raises: + Exception: Raised if the padding type is unknown. + + Returns: + tuple: Tuple containing the padded input size, left and right padding values. + """ if isinstance(pad_type, str): if pad_type.lower() == 'same': n_out = int( @@ -176,6 +193,26 @@ def compute_padding_1d_pytorch(pad_type, in_size, stride, filt_size, dilation): def compute_padding_2d_pytorch( pad_type, in_height, in_width, stride_height, stride_width, filt_height, filt_width, dilation_height, dilation_width ): + """Computes the amount of padding required on each side of the 2D input tensor following pytorch conventions. + + In case of ``same`` padding, this routine tries to pad evenly left and right (top and bottom), but if the amount of + columns to be added is odd, it will add the extra column to the right/bottom. + + Args: + pad_type (str or int): Padding type. If string, one of ``same`` or ``valid`` (case insensitive). + in_height (int): The height of the input tensor. + in_width (int): The width of the input tensor. + stride_height (int): Stride height. + stride_width (int): Stride width. + filt_height (int): Height of the kernel window. + filt_width (int): Width of the kernel window. + + Raises: + Exception: Raised if the padding type is unknown. 
+ + Returns: + tuple: Tuple containing the padded input height, width, and top, bottom, left and right padding values. + """ if isinstance(pad_type, str): if pad_type.lower() == 'same': # Height diff --git a/hls4ml/model/attributes.py b/hls4ml/model/attributes.py index 0e8df6e10a..d03d2bd108 100644 --- a/hls4ml/model/attributes.py +++ b/hls4ml/model/attributes.py @@ -36,11 +36,12 @@ class Attribute: """ - def __init__(self, name, value_type=Integral, default=None, configurable=False): + def __init__(self, name, value_type=Integral, default=None, configurable=False, description=None): self.name = name self.value_type = value_type self.default = default self.configurable = configurable + self.description = description def validate_value(self, value): if self.value_type is not None: @@ -59,6 +60,20 @@ def config_name(self): """ return convert_to_pascal_case(self.name) + def __eq__(self, other: object) -> bool: + if not isinstance(other, Attribute): + return NotImplemented + return ( + self.name == other.name + and self.value_type == other.value_type + and self.default == other.default + and self.configurable == other.configurable + and self.description == other.description + ) + + def __hash__(self) -> int: + return hash((self.name, self.value_type, self.default, self.configurable, self.description)) + class ConfigurableAttribute(Attribute): """ @@ -68,8 +83,8 @@ class ConfigurableAttribute(Attribute): when defining the expected attributes of layer classes. """ - def __init__(self, name, value_type=int, default=None): - super().__init__(name, value_type, default, configurable=True) + def __init__(self, name, value_type=Integral, default=None, description=None): + super().__init__(name, value_type, default, configurable=True, description=description) class TypeAttribute(Attribute): @@ -79,10 +94,10 @@ class TypeAttribute(Attribute): As a convention, the name of the attribute storing a type will end in ``_t``. """ - def __init__(self, name, default=None, configurable=True): + def __init__(self, name, default=None, configurable=True, description=None): if not name.endswith('_t'): name += '_t' - super().__init__(name, value_type=NamedType, default=default, configurable=configurable) + super().__init__(name, value_type=NamedType, default=default, configurable=configurable, description=description) class ChoiceAttribute(Attribute): @@ -90,25 +105,31 @@ class ChoiceAttribute(Attribute): Represents an attribute whose value can be one of several predefined values. """ - def __init__(self, name, choices, default=None, configurable=True): - super().__init__(name, value_type=list, default=default, configurable=configurable) + def __init__(self, name, choices, default=None, configurable=True, description=None): + super().__init__(name, value_type=list, default=default, configurable=configurable, description=description) assert len(choices) > 0 if default is not None: assert default in choices self.choices = choices - self.value_type = str(self.choices) def validate_value(self, value): return value in self.choices + def __eq__(self, other: object) -> bool: + base_eq = super().__eq__(other) + return base_eq and hasattr(other, 'choices') and set(self.choices) == set(other.choices) + + def __hash__(self) -> int: + return super().__hash__() ^ hash(tuple(sorted(self.choices))) + class WeightAttribute(Attribute): """ Represents an attribute that will store a weight variable. 
""" - def __init__(self, name): - super().__init__(name, value_type=WeightVariable, default=None, configurable=False) + def __init__(self, name, description=None): + super().__init__(name, value_type=WeightVariable, default=None, configurable=False, description=description) class CodeAttrubute(Attribute): @@ -116,8 +137,8 @@ class CodeAttrubute(Attribute): Represents an attribute that will store generated source code block. """ - def __init__(self, name): - super(WeightAttribute, self).__init__(name, value_type=Source, default=None, configurable=False) + def __init__(self, name, description=None): + super().__init__(name, value_type=Source, default=None, configurable=False, description=description) # endregion diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index e9c69eae4e..520f96ba5f 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -10,6 +10,7 @@ from hls4ml.model.flow import get_flow from hls4ml.model.layers import layer_map from hls4ml.model.optimizer import get_available_passes, optimize_model +from hls4ml.utils.string_utils import convert_to_snake_case class HLSConfig: @@ -35,7 +36,7 @@ def __init__(self, config): self.layer_type_targ_cycles = {} self.layer_name_targ_cycles = {} - self.model_strategy = 'Latency' + self.model_strategy = convert_to_snake_case('Latency') self.layer_type_strategy = {} self.layer_name_strategy = {} @@ -49,7 +50,8 @@ def __init__(self, config): self.trace_output = self.get_config_value('TraceOutput', False) - self.pipeline_style = 'pipeline' + self.pipeline_style = 'auto' + self.pipeline_ii = None if 'WriterConfig' in self.config: self.writer_config = self.config['WriterConfig'] @@ -61,7 +63,6 @@ def __init__(self, config): } self._parse_hls_config() - self._validate_hls_config() def get_config_value(self, key, default=None): return self.config.get(key, default) @@ -120,7 +121,8 @@ def get_precision(self, layer, var='default'): type_name = layer.name.lower() + '_' + var + '_t' if precision is None: precision = self.layer_name_precision.get(layer.name.lower() + '_default') - type_name = layer.name.lower() + '_default_t' + # I think it is better to keep these unique still to avoid inadvertent updates + # type_name = layer.name.lower() + '_default_t' if precision is None: precision = self.layer_type_precision.get(layer.class_name.lower() + '_' + var) @@ -217,7 +219,7 @@ def parse_name_config(self, layer_name, layer_cfg): strategy = layer_cfg.get('Strategy') if strategy is not None: - self.layer_name_strategy[layer_name.lower()] = strategy + self.layer_name_strategy[layer_name.lower()] = convert_to_snake_case(strategy) conv_implementation = layer_cfg.get('ConvImplementation') if conv_implementation is not None: @@ -265,9 +267,10 @@ def _parse_hls_config(self): self.model_rf = model_cfg.get('ReuseFactor') self.model_targ_cycles = model_cfg.get('TargetCycles') self.model_conv_implementation = model_cfg.get('ConvImplementation', 'LineBuffer') - self.model_strategy = model_cfg.get('Strategy', 'Latency') + self.model_strategy = convert_to_snake_case(model_cfg.get('Strategy', 'Latency')) self.model_compression = bool(model_cfg.get('Compression', 0)) - self.pipeline_style = model_cfg.get('PipelineStyle', 'pipeline') + self.pipeline_style = model_cfg.get('PipelineStyle', 'auto') + self.pipeline_ii = model_cfg.get('PipelineInterval', None) layer_type_cfg = hls_config.get('LayerType') if layer_type_cfg is not None: @@ -289,7 +292,7 @@ def _parse_hls_config(self): strategy = layer_cfg.get('Strategy') if strategy is not None: - 
self.layer_type_strategy[layer_type.lower()] = strategy + self.layer_type_strategy[layer_type.lower()] = convert_to_snake_case(strategy) conv_implementation = layer_cfg.get('ConvImplementation') if conv_implementation is not None: @@ -304,50 +307,6 @@ def _parse_hls_config(self): for layer_name, layer_cfg in layer_name_cfg.items(): self.parse_name_config(layer_name, layer_cfg) - def _validate_hls_config(self): - use_dataflow = False - if self.pipeline_style.lower() == 'pipeline' and self.model_compression: - print('WARNING: Compression enabled while pipeline style set to "pipeline".') - use_dataflow = True - for layer_type, strategy in self.layer_type_strategy.items(): - if strategy.lower() == 'resource' and self.pipeline_style.lower() == 'pipeline': - print( - 'WARNING: Strategy for layer type {} set to "Resource", while pipeline style set to "pipeline".'.format( - layer_type - ) - ) - use_dataflow = True - - for layer_name, strategy in self.layer_name_strategy.items(): - if strategy.lower() == 'resource' and self.pipeline_style.lower() == 'pipeline': - print( - 'WARNING: Strategy for layer {} set to "Resource", while pipeline style set to "pipeline".'.format( - layer_name - ) - ) - use_dataflow = True - - for layer_type, compression in self.layer_type_compression.items(): - if compression and self.pipeline_style.lower() == 'pipeline': - print( - 'WARNING: Compression enabled for layer type {}, while pipeline style set to "pipeline".'.format( - layer_type - ) - ) - use_dataflow = True - - for layer_name, compression in self.layer_name_compression.items(): - if compression and self.pipeline_style.lower() == 'pipeline': - print(f'WARNING: Compression enabled for layer {layer_name}, while pipeline style set to "pipeline".') - use_dataflow = True - - if self.model_strategy.lower() == 'resource': - use_dataflow = True - - if use_dataflow: - print('WARNING: Changing pipeline style to "dataflow".') - self.pipeline_style = 'dataflow' - class ModelGraph: """The ModelGraph represents the network that is being processed by hls4ml. @@ -547,6 +506,8 @@ def insert_node(self, node, before=None, input_idx=0): if next_node is not None: next_node.inputs[input_idx] = node.outputs[0] + else: + self.outputs = [node.outputs[0] if name == prev_node.outputs[0] else name for name in self.outputs] new_graph = OrderedDict() for k, v in self.graph.items(): @@ -555,47 +516,57 @@ def insert_node(self, node, before=None, input_idx=0): new_graph[node.name] = node self.graph = new_graph - self._update_model_outputs() def remove_node(self, node, rewire=True): - """Remove a node from a graph. + """Removes a node from the graph. - By default, this function can connect the outputs of previous node to the input of next one. - Note that when removing a leaf node `rewire` should be set to `False`. + By default, this function connects the outputs of the previous + node to the inputs of the next node. If the removed node has multiple + input/output tensors, an exception is raised. Args: - node (Layer): The node to remove - rewire (bool, optional): If `True`, connects the outputs of the previous node - to the inputs of the next node + node (Layer): The node to remove. + rewire (bool, optional): Deprecated, has no effect. Raises: - Exception: If an attempt is made to rewire a leaf node or a node with multiple - inputs/outputs. + Exception: If an attempt is made to rewire a node with + multiple inputs/outputs. + Note: + The `rewire` parameter is deprecated and has no effect. 
""" - if rewire: - inputs = [inp for inp in node.inputs if inp] - outputs = [outp for outp in node.outputs if outp] - if len(inputs) > 1 or len(outputs) > 1: - raise Exception('Cannot rewire a node with multiple inputs/outputs') - prev_node = node.get_input_node(node.inputs[0]) + + inputs = [inp for inp in node.inputs if inp] + outputs = [outp for outp in node.outputs if outp] + + if len(inputs) > 1 or len(outputs) > 1: + raise Exception('Cannot delete a node with multiple inputs/outputs') + + if len(inputs) == 1: + # Connect inputs -> $outputs + if node.name in self.outputs: + msg = f'Remove leaf node {node.name} will connect its input node {inputs[0]} to output, but it already is.' + assert inputs[0] not in self.outputs, msg + self.outputs = [inputs[0] if name == node.name else name for name in self.outputs] + + if len(outputs) == 1 and len(inputs) == 1: + inp_var = node.get_input_variable() + out_var = node.get_output_variable() + + # fmt: off + assert (np.prod(inp_var.shape) == np.prod(out_var.shape)), \ + f'Input and output shapes do not match for {node.name}: {inp_var.shape} -> {out_var.shape}' + # fmt: on + next_nodes = [x for x in self.graph.values() if node.outputs[0] in x.inputs] - if prev_node is not None: - if len(next_nodes) > 0: - for next_node in next_nodes: - for i, _ in enumerate(next_node.inputs): - if node.outputs[0] == next_node.inputs[i]: - next_node.inputs[i] = prev_node.outputs[0] - break - else: - if not node.outputs[0] in self.outputs: - raise Exception('Cannot rewire a node without child') - else: - raise Exception('Cannot rewire a node without a parent') + for next_node in next_nodes: + # Connect inputs -> next + for i, nxt_inp in enumerate(next_node.inputs): + if outputs[0] == nxt_inp: + next_node.inputs[i] = inputs[0] del self.output_vars[node.outputs[0]] del self.graph[node.name] - self._update_model_outputs() def replace_node(self, old_node, new_node): """Replace an existing node in the graph with a new one. @@ -625,7 +596,11 @@ def replace_node(self, old_node, new_node): node.outputs[i] = repl[n] self.graph = OrderedDict((new_node.name, new_node) if k == old_node.name else (k, v) for k, v in self.graph.items()) - self._update_model_outputs() + + old_name = old_node.name + if old_name in self.outputs: + new_name = new_node.name + self.outputs = [new_name if name == old_name else name for name in self.outputs] def split_node(self, old_node, new_node1, new_node2): """Replace an existing node in the graph with two nodes in sequence. @@ -663,17 +638,9 @@ def split_node(self, old_node, new_node1, new_node2): else: new_graph[key] = value self.graph = new_graph - self._update_model_outputs() - - def _update_model_outputs(self): - '''Update the model outputs - All node outputs and inputs are found. The model outputs are set to all node outputs - that are not also node inputs. 
- ''' - node_outputs = [out for node in self.graph.values() for out in node.outputs] - node_inputs = [inp for node in self.graph.values() for inp in node.inputs] - self.outputs = [out for out in node_outputs if out not in node_inputs] + if old_node.name in self.outputs: + self.outputs = [new_node2.name if name == old_node.name else name for name in self.outputs] def next_layer(self): self.index += 1 @@ -805,37 +772,24 @@ def predict(self, x): n_inputs = len(self.get_input_variables()) n_outputs = len(self.get_output_variables()) - curr_dir = os.getcwd() - newdir = ( - self.config.get_output_dir() + '/firmware' - if os.path.exists(self.config.get_output_dir() + '/firmware') - else self.config.get_output_dir() + '/src/firmware' - ) - os.chdir(newdir) - output = [] if n_samples == 1 and n_inputs == 1: x = [x] - try: - for i in range(n_samples): - predictions = [np.zeros(yj.size(), dtype=ctype) for yj in self.get_output_variables()] - if n_inputs == 1: - inp = [np.asarray(x[i])] - else: - inp = [np.asarray(xj[i]) for xj in x] - argtuple = inp - argtuple += predictions - argtuple = tuple(argtuple) - top_function(*argtuple) - output.append(predictions) - - # Convert to list of numpy arrays (one for each output) - output = [ - np.asarray([output[i_sample][i_output] for i_sample in range(n_samples)]) for i_output in range(n_outputs) - ] - finally: - os.chdir(curr_dir) + for i in range(n_samples): + predictions = [np.zeros(yj.size(), dtype=ctype) for yj in self.get_output_variables()] + if n_inputs == 1: + inp = [np.asarray(x[i])] + else: + inp = [np.asarray(xj[i]) for xj in x] + argtuple = inp + argtuple += predictions + argtuple = tuple(argtuple) + top_function(*argtuple) + output.append(predictions) + + # Convert to list of numpy arrays (one for each output) + output = [np.asarray([output[i_sample][i_output] for i_sample in range(n_samples)]) for i_output in range(n_outputs)] if n_samples == 1 and n_outputs == 1: return output[0][0] diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 8054f41ee6..3847cda9cf 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -22,9 +22,11 @@ IntegerPrecisionType, NamedType, TensorVariable, + UnspecifiedPrecisionType, WeightVariable, find_minimum_width, ) +from hls4ml.utils import attribute_descriptions as descriptions from hls4ml.utils.string_utils import convert_to_snake_case @@ -52,9 +54,9 @@ class Layer: """ _expected_attributes = [ - Attribute('index'), - ConfigurableAttribute('trace', default=False), - TypeAttribute('result'), + Attribute('index', description=descriptions.index), + ConfigurableAttribute('trace', default=False, description=descriptions.trace), + TypeAttribute('result', description=descriptions.result_type), ] @classproperty @@ -174,10 +176,12 @@ def _wrap_precision_to_type(self, name, precision): return NamedType(name=name, precision=precision) def _set_accum_t(self): - has_accum_t = any(a for a in self.expected_attributes if a.name == 'accum_t' and isinstance(a, TypeAttribute)) - if has_accum_t: - accum_t = NamedType(*reversed(self.model.config.get_precision(self, 'accum'))) - self.set_attr('accum_t', accum_t) + """Set the accumulator, but don't overwrite an existing one""" + if self.get_attr('accum_t') is None: + has_accum_t = any(a for a in self.expected_attributes if a.name == 'accum_t' and isinstance(a, TypeAttribute)) + if has_accum_t: + accum_t = NamedType(*reversed(self.model.config.get_precision(self, 'accum'))) + self.set_attr('accum_t', accum_t) def _set_type_t(self, name): has_type_t = any(a for a in 
self.expected_attributes if a.name == name + '_t' and isinstance(a, TypeAttribute)) @@ -344,7 +348,7 @@ class Input(Layer): def initialize(self): shape = self.attributes['input_shape'] if shape[0] is None: - shape = shape[1:] + raise RuntimeError(f"Unexpectedly have a None in {shape=} of Input layer") dims = [f'N_INPUT_{i}_{self.index}' for i in range(1, len(shape) + 1)] if self.index == 1: default_type_name = 'input_t' @@ -355,6 +359,50 @@ def initialize(self): self.add_output_variable(shape, dims, var_name=self.name, type_name=type_name, precision=precision) +class Constant(Layer): + # one could consider making this a weight attribute, but given its transient nature, I am not sure it helps + _expected_attributes = [ + Attribute('value', value_type=np.ndarray), + ] + + def initialize(self): + value = self.attributes['value'] + shape = list(value.shape) + if not shape: + shape = (1,) + self.set_attr('value', np.array([value])) + dims = [f'{self.name}_{i}' for i in range(len(shape))] + quantizer = self.get_attr('quantizer') + + # the graph._make_graph function sets the input node to the previous node + # if it is not set. That is incorrect for Constant nodes, so remove the input node + self.inputs = [] + + # Should the else clause below be None or UnspecifiedPrecisionType + precision = quantizer.hls_type if quantizer is not None else UnspecifiedPrecisionType() + + self.add_output_variable(shape, dims, var_name=self.name, precision=precision) + + +class Quant(Layer): # The QONNX quantization layer + """ + This is a QONNX quantization layer. Optimizations should convert it + before HLS is produced. + """ + + _expected_attributes = [ + Attribute('narrow', value_type=bool), + Attribute('rounding_mode', value_type=str), + Attribute('signed', value_type=bool), + ] + + def initialize(self): + inp = self.get_input_variable(self.inputs[0]) + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + + class Reshape(Layer): _expected_attributes = [ Attribute('target_shape', value_type=typing.Sequence), @@ -362,17 +410,18 @@ class Reshape(Layer): def initialize(self): input_shape = self.get_input_variable(self.inputs[0]).shape - target_shape = self.get_attr('target_shape') + target_shape = self.get_attr('target_shape') # this should not have a batch dimension if target_shape is None: # need to get it from the input shape_node = self.get_input_node(self.inputs[1]) # for QONNX, remove batch dimension - if shape_node: - target_shape = shape_node.value[1:] + # (onnx cleaning should have removed reshapes not on data path) + if isinstance(shape_node, Constant): + target_shape = shape_node.attributes['value'][1:] else: raise RuntimeError("Reshape for ONNX requires the target shape to be a second input.") - # remove Nones -- is this ever triggered? + # remove Nones -- Seems to be used by pytorch parser if target_shape[0] is None: target_shape = target_shape[1:] @@ -406,7 +455,7 @@ class Dense(Layer): ] def initialize(self): - shape = self.get_input_variable().shape[:] + shape = list(self.get_input_variable().shape) shape[-1] = self.attributes['n_out'] if len(shape) > 1: dims = [f'N_LAYER_{i}_{self.index}' for i in range(1, len(shape) + 1)] @@ -417,6 +466,26 @@ def initialize(self): self.add_bias(quantizer=self.get_attr('bias_quantizer')) +class Conv(Layer): + """ + This is for the ONNX Conv node. Currently, it is only supported as an intermediate + form that gets converted to an explicit ConvXD. + + Note: these are always channels-last. 
+ """ + + def initialize(self): + if self.attributes['n_dim'] == 1: + # this is 1D convolution + shape = [self.attributes['out_width'], self.attributes['n_filt']] + dims = [f'N_OUTPUTS_{self.index}', f'N_FILT_{self.index}'] + else: + shape = [self.attributes['out_height'], self.attributes['out_width'], self.attributes['n_filt']] + dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_FILT_{self.index}'] + + self.add_output_variable(shape, dims) + + class Conv1D(Layer): _expected_attributes = [ Attribute('in_width'), @@ -868,7 +937,7 @@ def _get_act_function_name(self): class HardActivation(Activation): ''' - Implements the hard sigmoid and tan function in keras and qkeras + Implements the hard sigmoid and tanh function in keras and qkeras (Default parameters in qkeras are different, so should be configured) The hard sigmoid unction is clip(slope * x + shift, 0, 1), and the hard tanh function is 2 * hard_sigmoid - 1 @@ -915,10 +984,24 @@ def initialize(self): super().initialize() +class BatchNormOnnx(Layer): + ''' + A transient layer formed from ONNX BatchNormalization that gets converted to + BatchNormalization after the scale and bias are determined + ''' + + def initialize(self): + inp = self.get_input_variable() + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + + +# TODO: We currently seem to ignore the quantizers to mean, variance, etc. class BatchNormalization(Layer): _expected_attributes = [ Attribute('n_in'), - Attribute('n_filt', default=0), + Attribute('n_filt', default=-1), WeightAttribute('scale'), WeightAttribute('bias'), TypeAttribute('scale'), @@ -945,6 +1028,36 @@ def initialize(self): self.add_weights_variable(name='bias', var_name='b{index}', data=bias) +# TODO: discuss whether this should be renamed to soemthing more descriptive, and whether the class hierarchy makes sense +class ApplyAlpha(BatchNormalization): + '''A custom layer to scale the output of a QDense layer which used 'alpha != 1' + Inference computation uses BatchNormalization methods''' + + def initialize(self): + inp = self.get_input_variable() + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + self.set_attr('n_in', inp.size()) + + # precision values are ignored if quantizer is not None + scale = self.get_attr('scale_data') + scale_quantizer = self.get_attr('scale_quantizer') + scale_precision = self.get_attr('scale_precision') + bias = self.get_attr('bias_data') + bias_quantizer = self.get_attr('bias_quantizer') + bias_precision = self.get_attr('bias_precision') + + self.add_weights(scale, quantizer=scale_quantizer, precision=scale_precision) + self.add_bias(bias, quantizer=bias_quantizer, precision=bias_precision) + + def add_weights(self, scale, quantizer=None, precision=None): + self.add_weights_variable(name='scale', var_name='s{index}', data=scale, quantizer=quantizer, precision=precision) + + def add_bias(self, bias, quantizer=None, precision=None): + self.add_weights_variable(name='bias', var_name='b{index}', data=bias, quantizer=quantizer, precision=precision) + + class Merge(Layer): def initialize(self): assert len(self.inputs) == 2 @@ -959,6 +1072,31 @@ def initialize(self): self.add_output_variable(shape, dims) +class MatMul(Layer): + """ + This is a matrix multiply. Currently, it is only supported as an intermediate + form that gets converted to a Dense layer. 
+ """ + + def initialize(self): + assert len(self.inputs) == 2 + inp1 = self.get_input_variable(self.inputs[0]) + inp2 = self.get_input_variable(self.inputs[1]) + if len(inp2.shape) == 1: + # mat vec multiply + assert inp1.shape[-1] == inp2.shape[0] + shape = list(inp1.shape[:-1]) + [inp2.shape[0]] + else: + assert inp1.shape[-1] == inp2.shape[-2] + shape = list(inp1.shape[:-1]) + [inp2.shape[-1]] + if len(shape) > 1: + dims = [f'N_LAYER_{i}_{self.index}' for i in range(1, len(shape) + 1)] + else: + dims = [f'N_LAYER_{self.index}'] + + self.add_output_variable(shape, dims) + + class Dot(Merge): def initialize(self): assert len(self.inputs) == 2 @@ -1012,20 +1150,67 @@ class Resize(Layer): def initialize(self): inp = self.get_input_variable() - if self.get_attr('data_format') == 'channels_last': - if len(inp.shape) == 2: # 1D -> width + chan - shape = [self.get_attr('out_width'), self.get_attr('n_chan')] - dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] - elif len(inp.shape) == 3: # 2D -> height + width + chan - shape = [self.get_attr('out_height'), self.get_attr('out_width'), self.get_attr('n_chan')] - dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + if len(self.inputs) > 1: + # In order to be correctly ingested by hls4ml the QONNX resize node should have 3 inputs set with RoI left empty + if len(self.inputs) == 2: + raise Exception( + 'The number of inputs to Resize node is equal to 2. ' + 'In this case, either one is trying to use a version 10 node ' + 'or one is using the RoI parameter only to perform the resize operation, ' + 'both not supported in hls4ml' + ) + if len(self.inputs) == 4: + raise Exception('Sizes parameter is not supported by hls4ml. Use scales instead') + # get the scales of Resize node from QONNX frontend + # see doc here: https://onnx.ai/onnx/operators/onnx__Resize.html + scales_idx = 2 if len(self.inputs) == 3 or len(self.inputs) == 4 else 1 + scales = self.get_input_node(self.inputs[scales_idx]).get_attr('value') + if len(scales) == 4: # Resize 2D + self.set_attr('out_width', int(self.get_attr('in_width') * scales[1])) + self.set_attr('out_height', int(self.get_attr('in_height') * scales[2])) + self.set_attr('n_chan', int(self.get_attr('n_chan') * scales[3])) + elif len(scales) == 3: # Resize 1D + self.set_attr('out_width', int(self.get_attr('in_width') * scales[1])) + self.set_attr('n_chan', int(self.get_attr('n_chan') * scales[2])) + else: + raise Exception('Resize 1D and Resize 2D are the ones supported in hls4ml') + if self.get_attr('data_format') == 'channels_last': + if len(inp.shape) == 2: # 1D -> width + chan + shape = [int(self.get_attr('out_width')), int(self.get_attr('n_chan'))] + dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [ + int(self.get_attr('out_height')), + int(self.get_attr('out_width')), + int(self.get_attr('n_chan')), + ] + dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + else: + if len(inp.shape) == 2: # 1D -> width + chan + shape = [int(self.get_attr('n_chan')), int(self.get_attr('out_width'))] + dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [ + int(self.get_attr('n_chan')), + int(self.get_attr('out_height')), + int(self.get_attr('out_width')), + ] + dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] else: - if len(inp.shape) == 2: # 1D -> width + chan - 
shape = [self.get_attr('n_chan'), self.get_attr('out_width')] - dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] - elif len(inp.shape) == 3: # 2D -> height + width + chan - shape = [self.get_attr('n_chan'), self.get_attr('out_height'), self.get_attr('out_width')] - dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] + if self.get_attr('data_format') == 'channels_last': + if len(inp.shape) == 2: # 1D -> width + chan + shape = [self.get_attr('out_width'), self.get_attr('n_chan')] + dims = [f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [self.get_attr('out_height'), self.get_attr('out_width'), self.get_attr('n_chan')] + dims = [f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}', f'N_CHAN_{self.index}'] + else: + if len(inp.shape) == 2: # 1D -> width + chan + shape = [self.get_attr('n_chan'), self.get_attr('out_width')] + dims = [f'N_CHAN_{self.index}', f'OUT_WIDTH_{self.index}'] + elif len(inp.shape) == 3: # 2D -> height + width + chan + shape = [self.get_attr('n_chan'), self.get_attr('out_height'), self.get_attr('out_width')] + dims = [f'N_CHAN_{self.index}', f'OUT_HEIGHT_{self.index}', f'OUT_WIDTH_{self.index}'] self.add_output_variable(shape, dims, precision=inp.type.precision) @@ -1434,6 +1619,7 @@ def initialize(self): layer_map = { 'Input': Input, 'InputLayer': Input, + 'Constant': Constant, 'Activation': Activation, 'QActivation': Activation, 'LeakyReLU': ParametrizedActivation, @@ -1448,6 +1634,7 @@ def initialize(self): 'BinaryDense': Dense, 'TernaryDense': Dense, 'QDense': Dense, + 'Conv': Conv, 'Conv1D': Conv1D, 'QConv1D': Conv1D, 'Conv2D': Conv2D, @@ -1474,6 +1661,7 @@ def initialize(self): 'ZeroPadding1D': ZeroPadding1D, 'ZeroPadding2D': ZeroPadding2D, 'Merge': Merge, + 'MatMul': MatMul, 'Dot': Dot, 'Concatenate': Concatenate, 'Resize': Resize, @@ -1489,6 +1677,9 @@ def initialize(self): 'QGRU': GRU, 'GarNet': GarNet, 'GarNetStack': GarNetStack, + 'Quant': Quant, + 'ApplyAlpha': ApplyAlpha, + 'BatchNormOnnx': BatchNormOnnx, 'LayerGroup': LayerGroup, 'SymbolicExpression': SymbolicExpression, # TensorFlow-specific layers: diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 77e38b0c5b..7e9325ccd0 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -30,11 +30,36 @@ del module_path del optimizers +register_flow( + 'parse_qonnx', + [ + 'reshape_constant', + 'resize_remove_constants', + 'quant_constant_parameters', + 'quant_to_activation', + 'fuse_quant_with_constant', + 'const_quant_to_const_alpha', + 'quant_to_alpha_activation_alpha', + 'batch_norm_onnx_constant_parameters', + 'constant_batch_norm_fusion', + 'merge_two_constants', + 'scale_down_add', + 'bias_down_add', + 'scale_down_mat_mul', + 'scale_down_conv', + 'merge_to_apply_alpha', + 'merge_to_apply_alpha_div', + 'matmul_const_to_dense', + 'conv_to_conv_x_d', + 'conv_to_depthwise_conv_x_d', + ], +) + register_flow( 'convert', [ 'channels_last_converter', - 'seperable_to_depthwise_and_conv', + 'separable_to_depthwise_and_conv', 'remove_transpose_before_flatten', 'remove_nop_transpose', 'remove_single_channel_transpose', @@ -48,13 +73,17 @@ 'replace_multidimensional_dense_with_conv', 'enforce_proxy_model_embedded_config', 'eliminate_linear_activation', + 'merge_linear_activation', # many of the above optimzers need to be done before this 'infer_precision_types', ], + requires=['parse_qonnx'], ) # TODO Maybe not all QKeras optmizers 
belong here? register_flow( 'optimize', - [], + [ + 'remove_nop_batch_normalization', + ], requires=['convert'], ) diff --git a/hls4ml/model/optimizer/passes/batchnorm_opt.py b/hls4ml/model/optimizer/passes/batchnorm_opt.py new file mode 100644 index 0000000000..26b7b18e38 --- /dev/null +++ b/hls4ml/model/optimizer/passes/batchnorm_opt.py @@ -0,0 +1,274 @@ +import warnings + +import numpy as np + +from hls4ml.model.layers import BatchNormalization, BatchNormOnnx, Constant +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.quantizers import QuantNodeQuantizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, UnspecifiedPrecisionType + +_base_attributes = ('epsilon', 'n_in', 'n_filt') + + +class BatchNormOnnxConstantParameters(OptimizerPass): + """Remove Constant from the BatchNormalization node parameters (but not input[0])""" + + def match(self, node): + is_match = isinstance(node, BatchNormOnnx) and any(node.inputs[1:]) + + return is_match + + def transform(self, model, node): + """ + Remove Constant from the BatchNormalization node parameters (but not input[0]) + + TODO: Currently the quantizers are not actually used by the underlying layer. + """ + + if not (len(node.inputs) == 5 and all(node.inputs)): + raise ValueError('All 5 BatchNormOnnnx inputs need to be defined') + + attributes = {k: node.attributes[k] for k in _base_attributes if k in node.attributes} + + gamma_node = node.get_input_node(node.inputs[1]) + if not isinstance(gamma_node, Constant): + raise TypeError('Only constant gammas supported') + gamma = gamma_node.attributes['value'] + attributes['gamma_data'] = gamma + attributes['gamma_quantizer'] = gamma_node.get_attr('quantizer') + + node.inputs[1] = '' + model.remove_node(gamma_node, rewire=False) + + beta_node = node.get_input_node(node.inputs[2]) + if not isinstance(beta_node, Constant): + raise TypeError('Only constant betas supported') + beta = beta_node.attributes['value'] + attributes['beta_data'] = beta + attributes['beta_quantizer'] = beta_node.get_attr('quantizer') + node.inputs[2] = '' + model.remove_node(beta_node, rewire=False) + + moving_mean_node = node.get_input_node(node.inputs[3]) + if not isinstance(moving_mean_node, Constant): + raise TypeError('Only constant moving_means supported') + moving_mean = moving_mean_node.attributes['value'] + attributes['mean_data'] = moving_mean + attributes['mean_quantizer'] = moving_mean_node.get_attr('quantizer') + node.inputs[3] = '' + model.remove_node(moving_mean_node, rewire=False) + + moving_variance_node = node.get_input_node(node.inputs[4]) + if not isinstance(moving_variance_node, Constant): + raise TypeError('Only constant moving_variances supported') + moving_variance = moving_variance_node.attributes['value'] + attributes['variance_data'] = moving_variance + attributes['variance_quantizer'] = moving_variance_node.get_attr('quantizer') + node.inputs[4] = '' + model.remove_node(moving_variance_node, rewire=False) + + node.inputs = [inp for inp in node.inputs if inp] + if len(node.inputs) != 1: + raise RuntimeError('The QONNX batchnorm had unexpected inputs.') + + new_node = model.make_node(BatchNormalization, node.name, attributes, [node.inputs[0]], [x for x in node.outputs]) + + model.replace_node(node, new_node) + + return True + + +# Most likely this case is removed by qonnx cleaning +class ConstantBatchNormFusion(OptimizerPass): + """ + Merge BatchNorm into Const (after parameters have already been merged in BatchNormalization) + """ + + def match(self, node): + 
is_match = ( +            isinstance(node, BatchNormalization) +            and not any(node.inputs[1:]) +            and isinstance(node.get_input_node(node.inputs[0]), Constant) +            and isinstance( +                node.get_input_node(node.inputs[0]).get_output_variable().type.precision, UnspecifiedPrecisionType +            ) +        ) +        return is_match + +    def transform(self, model, node): +        """ +        Remove the batch norm +        """ +        warnings.warn('ConstantBatchNormFusion should probably not be triggered. Check the optimizer order.', stacklevel=2) +        const_node = node.get_input_node(node.inputs[0]) + +        const_prec = const_node.get_output_variable().type.precision + +        new_val = ( +            const_node.attributes['value'] * node.weights['scale'].data_unquantized + node.weights['bias'].data_unquantized +        ) + +        const_node.set_attr('value', new_val) +        const_node.set_attr('quantizer', node.get_attr('quantizer'))  # None if not defined + +        if isinstance(node.get_output_variable().type.precision, UnspecifiedPrecisionType): +            if isinstance(const_prec, UnspecifiedPrecisionType): +                pass  # leave it as is +            else: +                const_node.get_output_variable().type.precision = UnspecifiedPrecisionType()  # default +                # propagate precision +                scale_q = node.get_attr('scale_quantizer') +                bias_q = node.get_attr('bias_quantizer') +                if scale_q and bias_q: +                    # propagate precision +                    scale_prec = scale_q.hls_type +                    bias_prec = bias_q.hls_type +                    if scale_prec not in (IntegerPrecisionType, FixedPrecisionType) or bias_prec not in ( +                        IntegerPrecisionType, +                        FixedPrecisionType, +                    ): +                        print("Warning: output type not propagated for constant merge") +                    else: +                        signed_prod = const_prec.signed or scale_prec.signed +                        w_prod = const_prec.width + scale_prec.width +                        i_prod = const_prec.integer + scale_prec.integer +                        signed = signed_prod or bias_prec.signed +                        i_tot = ( +                            max( +                                i_prod + (bias_prec.signed and not signed_prod), +                                bias_prec.integer + (signed_prod and not bias_prec.signed), +                            ) +                            + 1 +                        ) +                        w_tot = i_tot + max(w_prod - i_prod, bias_prec.width - bias_prec.integer) +                        new_prec = FixedPrecisionType(w_tot, i_tot, signed) +                        const_node.set_attr('quantizer', QuantNodeQuantizer(new_prec)) +                        const_node.get_output_variable().type.precision = new_prec +        else: +            const_node.get_output_variable().type.precision = node.get_output_variable().type.precision + +        # remove the batch norm node +        model.remove_node(node, rewire=True) + +        return True + + +class FuseConsecutiveBatchNormalization(OptimizerPass): +    """ +    OptimizerPass to merge consecutive BatchNormalization layers, only if the earlier one does not have the output type +    specified. There is a further check on the compatibility to merge: except in cases when merging a scale of 1 or a +    bias of 0, this does not merge when both scales or both biases are quantized. + +    Note: Consider restricting this to ApplyAlpha. Batch Normalization-style quantization seems to be ignored. + +    Note: This optimizer may not be safe if weights are updateable, in particular if a scale can go from ones to other +    values or if a bias can go from zeros to other values. +    """
+ """ + + def match(self, node): + prev_node = node.get_input_node() + basic_match = ( + isinstance(node, BatchNormalization) + and isinstance(prev_node, BatchNormalization) + and isinstance(prev_node.get_output_variable().type.precision, UnspecifiedPrecisionType) + ) + + # check for compatibility to merge + if basic_match: + s0 = prev_node.weights['scale'].data_unquantized + b0 = prev_node.weights['bias'].data_unquantized + s1 = node.weights['scale'].data_unquantized + b1 = node.weights['bias'].data_unquantized + scale_compatible = ( + (prev_node.get_attr('scale_quantizer') is None or node.get_attr('scale_quantizer') is None) + or (s0 == np.ones_like(s0)).all() + or (s1 == np.ones_like(s1)).all() + ) + bias_compatible = ( + (prev_node.get_attr('bias_quantizer') is None or node.get_attr('bias_quantizer') is None) + or (b0 == np.zeros_like(b0)).all() + or (b1 == np.zeros_like(b1)).all() + ) + return scale_compatible and bias_compatible + else: + return False + + def transform(self, model, node): + prev_node = node.get_input_node() + + prev_map = prev_node.get_output_use_map() + if len(prev_map[prev_node.outputs[0]]) > 1: + return False + + s0 = prev_node.weights['scale'].data_unquantized + b0 = prev_node.weights['bias'].data_unquantized + s1 = node.weights['scale'].data_unquantized + b1 = node.weights['bias'].data_unquantized + + if (s0 == np.ones_like(s0)).all(): + s_quantizer = node.get_attr('scale_quantizer') + elif (s1 == np.ones_like(s1)).all(): + s_quantizer = prev_node.get_attr('scale_quantizer') + else: + s_quantizer = None + + if (b0 == np.ones_like(b0)).all(): + b_quantizer = node.get_attr('bias_quantizer') + elif (b1 == np.ones_like(b1)).all(): + b_quantizer = prev_node.get_attr('bias_quantizer') + else: + b_quantizer = None + + node.set_attr('scale_quantizer', s_quantizer) + node.set_attr('bias_quantizer', b_quantizer) + + scale_new = s0 * s1 + bias_new = s1 * b0 + b1 + + # Not sure if this setting of this is useful + s_prec = None + if s_quantizer is None and (scale_new == np.ones_like(scale_new)).all(): + if ( + isinstance(prev_node.weights['scale'].type, IntegerPrecisionType) + and isinstance(node.weights['scale'].type, IntegerPrecisionType) + and prev_node.weights['scale'].type.width == 1 + and node.weights['scale'].type.width == 1 + ): + s_prec = node.weights['scale'].type + + b_prec = None + if b_quantizer is None and (bias_new == np.zeros_like(bias_new)).all(): + if ( + isinstance(prev_node.weights['bias'].type, IntegerPrecisionType) + and isinstance(node.weights['bias'].type, IntegerPrecisionType) + and prev_node.weights['bias'].type.width == 1 + and node.weights['bias'].type.width == 1 + ): + b_prec = node.weights['bias'].type + + # call function so that quantizer would be called if needed + node.add_weights_variable(name='scale', var_name='s{index}', data=scale_new, quantizer=s_quantizer, precision=s_prec) + node.add_weights_variable(name='bias', var_name='b{index}', data=bias_new, quantizer=b_quantizer, precision=b_prec) + + model.remove_node(prev_node, rewire=True) + return True + + +class RemoveNopBatchNormalization(OptimizerPass): + """ + OptimizerPass to remove batch normalizations that do nothing (scale 1, bias 0) + + Note: This optimizer may not be safe if weights are updateable. 
+ """ + + def match(self, node): + if isinstance(node, BatchNormalization): + s0 = node.weights['scale'].data_unquantized + b0 = node.weights['bias'].data_unquantized + return (s0 == np.ones_like(s0)).all() and (b0 == np.zeros_like(b0)).all() + else: + return False + + def transform(self, model, node): + model.remove_node(node, rewire=True) + return True diff --git a/hls4ml/model/optimizer/passes/bn_fuse.py b/hls4ml/model/optimizer/passes/bn_fuse.py index 02d9b849ed..be81d5fb3d 100644 --- a/hls4ml/model/optimizer/passes/bn_fuse.py +++ b/hls4ml/model/optimizer/passes/bn_fuse.py @@ -1,23 +1,54 @@ +import numpy as np + from hls4ml.model.layers import BatchNormalization, Conv1D, Conv2D, Dense from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, UnspecifiedPrecisionType class FuseBatchNormalization(OptimizerPass): + """ + OptimizerPass to merge a BatchNormalization layer with Dense or Conv layer, only if the Dense or Conv layer does not + have the output type specified. There is a further check on the compatibility to merge: except in cases when merging a + weight/scale of 1 or a bias of 0, this optimizer does not merge nodes when both the weight and scale or both biases + are quantized. + + Note: Consider restricting this to ApplyAlpha. Batch Normalization quantization seems to be ignored. + + Note: This optimizer may not be safe if weights are updateable. May need to turn off. + """ + def match(self, node): - is_match = ( + prev_node = node.get_input_node() + basic_match = ( isinstance(node, BatchNormalization) - and isinstance(node.get_input_node(), (Dense, Conv1D, Conv2D)) - and node.get_input_node().get_attr('weight_quantizer') is None - and node.get_input_node().get_attr('bias_quantizer') is None + and isinstance(prev_node, (Dense, Conv1D, Conv2D)) + and isinstance(prev_node.get_output_variable().type.precision, UnspecifiedPrecisionType) ) - return is_match + if basic_match: + s0 = prev_node.weights['weight'].data_unquantized + b0 = prev_node.weights['bias'].data_unquantized + s1 = node.weights['scale'].data_unquantized + b1 = node.weights['bias'].data_unquantized + scale_compatible = ( + (prev_node.get_attr('weight_quantizer') is None and node.get_attr('scale_quantizer') is None) + or ((s0 == np.ones_like(s0)).all() and prev_node.get_attr('weight_quantizer') is None) + or ((s1 == np.ones_like(s1)).all() and node.get_attr('scale_quantizer') is None) + ) + bias_compatible = ( + (prev_node.get_attr('bias_quantizer') is None and node.get_attr('bias_quantizer') is None) + or ((b0 == np.zeros_like(b0)).all() and prev_node.get_attr('bias_quantizer') is None) + or ((b1 == np.zeros_like(b1)).all() and node.get_attr('bias_quantizer') is None) + ) + return scale_compatible and bias_compatible + + else: + return False def transform(self, model, node): - # Fuse weight and bias of Dense/Conv1D/Conv2D layer with BN values + """Fuse weight and bias of Dense/Conv1D/Conv2D layer with BN values.""" parent_node = node.get_input_node() parent_map = parent_node.get_output_use_map() - node_map = node.get_output_use_map() - if len(parent_map[parent_node.name]) > 1 or len(node_map[node.name]) > 1: + if len(parent_map[parent_node.outputs[0]]) > 1: return False parent_weight = parent_node.weights['weight'] @@ -26,13 +57,38 @@ def transform(self, model, node): bn_scale = node.weights['scale'] bn_bias = node.weights['bias'] + allowed_precisions = (IntegerPrecisionType, FixedPrecisionType, UnspecifiedPrecisionType) + + # only merge if the types are 
integer or fixed + if ( + not isinstance(parent_weight.type.precision, allowed_precisions) + or not isinstance(parent_bias.type.precision, allowed_precisions) + or not isinstance(bn_scale.type.precision, allowed_precisions) + or not isinstance(bn_bias.type.precision, allowed_precisions) + ): + return False + fused_weight = bn_scale.data * parent_weight.data fused_bias = bn_scale.data * parent_bias.data + bn_bias.data + w_quantizer = ( + node.get_attr('scale_quantizer') + if node.get_attr('scale_quantizer') is not None + else parent_node.get_attr('weight_quantizer') + ) + b_quantizer = ( + node.get_attr('bias_quantizer') + if node.get_attr('bias_quantizer') is not None + else parent_node.get_attr('bias_quantizer') + ) + + node.set_attr('weight_quantizer', w_quantizer) + node.set_attr('bias_quantizer', b_quantizer) + + # call function so that quantizer would be called if needed + parent_node.add_weights_variable(name='weight', var_name='w{index}', data=fused_weight, quantizer=w_quantizer) + parent_node.add_weights_variable(name='bias', var_name='b{index}', data=fused_bias, quantizer=b_quantizer) + model.remove_node(node, rewire=True) - parent_weight.data = fused_weight - parent_bias.data = fused_bias - if not parent_node.get_attr('use_bias', True): - parent_bias.update_precision(bn_bias.type.precision) return True diff --git a/hls4ml/model/optimizer/passes/conv_to_convxd.py b/hls4ml/model/optimizer/passes/conv_to_convxd.py new file mode 100644 index 0000000000..3e870e43a6 --- /dev/null +++ b/hls4ml/model/optimizer/passes/conv_to_convxd.py @@ -0,0 +1,93 @@ +import numpy as np + +from hls4ml.model.layers import Constant, Conv, Conv1D, Conv2D +from hls4ml.model.optimizer import OptimizerPass + +# these are attributes to copy +_base_attributes = ( + 'in_width', + 'out_width', + 'n_chan', + 'n_filt', + 'pad_left', + 'pad_right', + 'filt_width', + 'stride_width', + 'dilation_width', + 'in_height', + 'out_height', + 'pad_top', + 'pad_bottom', + 'filt_height', + 'stride_height', + 'dilation_height', + 'data_format', +) + + +class ConvToConvXD(OptimizerPass): + """Convert Conv with constant to a Conv1D or Conv2D layer""" + + def match(self, node): + is_match = ( + isinstance(node, Conv) + and node.get_attr('group') == 1 + and ( + (len(node.inputs) == 2 and isinstance(node.get_input_node(node.inputs[1]), Constant)) + or ( + len(node.inputs) == 3 + and isinstance(node.get_input_node(node.inputs[1]), Constant) + and isinstance(node.get_input_node(node.inputs[2]), Constant) + ) + ) + ) + + return is_match + + def transform(self, model, node): + """Convert Conv with constant to a Conv1D or Conv2D layer""" + + weight_node = node.get_input_node(node.inputs[1]) + weight_data = weight_node.attributes['value'] + bias_node = None + if len(node.inputs) == 3: + bias_node = node.get_input_node(node.inputs[2]) + + # creating the attributes + attributes = {k: node.attributes[k] for k in _base_attributes if k in node.attributes} + + # The ConvxD nodes expect the weight data to be in a different format, not (M, k1.., C) + if node.attributes['n_dim'] == 1: + newtype = Conv1D + attributes['weight_data'] = np.transpose(weight_data, (1, 2, 0)) + else: + newtype = Conv2D + attributes['weight_data'] = np.transpose(weight_data, (1, 2, 3, 0)) + attributes['weight_quantizer'] = weight_node.get_attr('quantizer') + + if bias_node: + attributes['bias_data'] = bias_node.attributes['value'] + attributes['bias_quantizer'] = bias_node.get_attr('quantizer') + attributes['use_bias'] = True + else: + attributes['bias_data'] = 
np.zeros(attributes['n_filt']) + attributes['use_bias'] = False + + # get the configuration name + config = model.config.get_layer_config(node) + new_name = f'{newtype.__name__}_{node.name}' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) + + # making new node + new_node = model.make_node(newtype, new_name, attributes, [node.inputs[0]], [x for x in node.outputs]) + + # removing and replacing old nodes + if bias_node: + model.remove_node(bias_node, rewire=False) + del node.inputs[2] + model.remove_node(weight_node, rewire=False) + del node.inputs[1] + model.replace_node(node, new_node) + + return True diff --git a/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py b/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py new file mode 100644 index 0000000000..b1271b5784 --- /dev/null +++ b/hls4ml/model/optimizer/passes/conv_to_depthwiseconvxd.py @@ -0,0 +1,94 @@ +import numpy as np + +from hls4ml.model.layers import Constant, Conv, DepthwiseConv1D, DepthwiseConv2D +from hls4ml.model.optimizer import OptimizerPass + +# these are attributes to copy +_base_attributes = ( + 'in_width', + 'out_width', + 'n_chan', + 'n_filt', + 'pad_left', + 'pad_right', + 'filt_width', + 'stride_width', + 'dilation_width', + 'in_height', + 'out_height', + 'pad_top', + 'pad_bottom', + 'filt_height', + 'stride_height', + 'dilation_height', + 'data_format', +) + + +class ConvToDepthwiseConvXD(OptimizerPass): + """Convert Conv with constant to a DepthwiseConv1D or DepthwiseConv2D layer""" + + def match(self, node): + is_match = ( + isinstance(node, Conv) + and node.get_attr('group') == node.get_attr('n_chan') + and (node.get_attr('group') != 1) + and ( + (len(node.inputs) == 2 and isinstance(node.get_input_node(node.inputs[1]), Constant)) + or ( + len(node.inputs) == 3 + and isinstance(node.get_input_node(node.inputs[1]), Constant) + and isinstance(node.get_input_node(node.inputs[2]), Constant) + ) + ) + ) + + return is_match + + def transform(self, model, node): + """Convert Conv with constant to a DepthwiseConv1D or DepthwiseConv2D layer""" + + weight_node = node.get_input_node(node.inputs[1]) + weight_data = weight_node.attributes['value'] + bias_node = None + if len(node.inputs) == 3: + bias_node = node.get_input_node(node.inputs[2]) + + # creating the attributes + attributes = {k: node.attributes[k] for k in _base_attributes if k in node.attributes} + + # The ConvxD nodes expect the weight data to be in a different format, not (M, k1.., C) + if node.attributes['n_dim'] == 1: + newtype = DepthwiseConv1D + attributes['depthwise_data'] = np.transpose(weight_data, (1, 2, 0)) + else: + newtype = DepthwiseConv2D + attributes['depthwise_data'] = np.transpose(weight_data, (1, 2, 3, 0)) + attributes['depthwise_quantizer'] = weight_node.get_attr('quantizer') + + if bias_node: + attributes['bias_data'] = bias_node.attributes['value'] + attributes['bias_quantizer'] = bias_node.get_attr('quantizer') + attributes['use_bias'] = True + else: + attributes['bias_data'] = np.zeros(attributes['n_filt']) + attributes['use_bias'] = False + + # get the configuration name + config = model.config.get_layer_config(node) + new_name = f'{newtype.__name__}_{node.name}' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) + + # making new node + new_node = model.make_node(newtype, new_name, attributes, [node.inputs[0]], [x for x in node.outputs]) + + # removing and replacing old nodes + if bias_node: + model.remove_node(bias_node, 
rewire=False) + del node.inputs[2] + model.remove_node(weight_node, rewire=False) + del node.inputs[1] + model.replace_node(node, new_node) + + return True diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index a3b861ddfe..0b5f12c008 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -94,7 +94,11 @@ def transform(self, model, node): node.add_output_variable(shape, dims) # Have to transpose back before flattening to get correct order of elements in the flattened tensor - if isinstance(node, Reshape) and len(node.attributes['target_shape']) == 1: + if ( + isinstance(node, Reshape) + and len(node.attributes['target_shape']) == 1 + and not model.config.config['HLSConfig']['Model']['ChannelsLastConversion'] == "internal" + ): previous_node = node.get_input_node(node.inputs[0]) input = previous_node.name outshape = previous_node.get_output_variable().shape diff --git a/hls4ml/model/optimizer/passes/linear.py b/hls4ml/model/optimizer/passes/linear.py new file mode 100644 index 0000000000..ce0308eb66 --- /dev/null +++ b/hls4ml/model/optimizer/passes/linear.py @@ -0,0 +1,45 @@ +from hls4ml.model.layers import Activation, BatchNormalization, Conv1D, Conv2D, Dense +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.types import UnspecifiedPrecisionType + + +class EliminateLinearActivation(OptimizerPass): + def match(self, node): + cast = False + if isinstance(node, Activation): + cast = node.get_input_variable().type.precision != node.get_output_variable().type.precision + return isinstance(node, Activation) and node.get_attr('activation') == 'linear' and not cast + + def transform(self, model, node): + model.remove_node(node) + return True + + +_safe_parents = (Dense, Conv1D, Conv2D, BatchNormalization, Activation) + + +class MergeLinearActivation(OptimizerPass): + ''' + For many objects it's safe to change the output precision independently of the calculation. + ''' + + def match(self, node): + ''' + Only match if the parent is safe and the precision is not explicitly set. + ''' + if isinstance(node, Activation) and node.get_attr('activation') == 'linear': + parent = node.get_input_node(node.inputs[0]) + safe_parent = isinstance(parent, _safe_parents) + return safe_parent and isinstance(parent.get_output_variable().type.precision, UnspecifiedPrecisionType) + else: + return False + + def transform(self, model, node): + prev_node = node.get_input_node(node.inputs[0]) + quantizer = node.get_attr("quantizer") + # if the activation has a quantizer (usually from a QONNX Quant node), set the previous node's output precision + if quantizer is not None: + prev_node.set_attr("quantizer", quantizer) + prev_node.get_output_variable().type.precision = quantizer.hls_type + model.remove_node(node) + return True diff --git a/hls4ml/model/optimizer/passes/matmul_const_to_dense.py b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py new file mode 100644 index 0000000000..4c48944eb3 --- /dev/null +++ b/hls4ml/model/optimizer/passes/matmul_const_to_dense.py @@ -0,0 +1,58 @@ +import numpy as np + +from hls4ml.model.layers import Constant, Dense, MatMul +from hls4ml.model.optimizer import OptimizerPass + + +class MatmulConstToDense(OptimizerPass): + """ + Convert MatMul with constant to a dense layer. Note, this only supports the second input + being the constant. 
If needed, one could add transposes to make that be the case in + other yet to be written optimizers. + """ + + def match(self, node): + is_match = ( + isinstance(node, MatMul) and len(node.inputs) == 2 and isinstance(node.get_input_node(node.inputs[1]), Constant) + ) + return is_match + + def transform(self, model, node): + """Substitute Matmul + Constant for a single dense""" + # determining Constant layer input + const_node = node.get_input_node(node.inputs[1]) + other_var = node.get_input_variable(node.inputs[0]) + + weight_data = const_node.attributes['value'] + weight_quantizer = const_node.get_attr('quantizer') + + # get the configuration name + config = model.config.get_layer_config(node) + new_name = f'Dense_{node.name}' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) + + in_shape = other_var.shape + n_in = np.prod(in_shape) + out_shape = list(in_shape[:-1]) + [weight_data.shape[-1]] + n_out = np.prod(out_shape) + + # creating the attributes + attributes = { + 'weight_data': weight_data, + 'weight_quantizer': weight_quantizer, + 'bias_data': np.zeros(out_shape), + 'use_bias': False, + 'n_in': n_in, + 'n_out': n_out, + } + + # making new node + new_dense = model.make_node(Dense, new_name, attributes, [node.inputs[0]], [x for x in node.outputs]) + + # removing and replacing old nodes + model.remove_node(const_node, rewire=False) + del node.inputs[1] + model.replace_node(node, new_dense) + + return True diff --git a/hls4ml/model/optimizer/passes/merge_const.py b/hls4ml/model/optimizer/passes/merge_const.py new file mode 100644 index 0000000000..bdf7447838 --- /dev/null +++ b/hls4ml/model/optimizer/passes/merge_const.py @@ -0,0 +1,245 @@ +import numpy as np + +from hls4ml.model.layers import ApplyAlpha, Constant, Merge +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.quantizers import QuantNodeQuantizer +from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType + + +# This should generally not happen because of qonnx cleaning +class MergeTwoConstants(OptimizerPass): + """Merge of two constants makes another constant""" + + def match(self, node): + is_match = ( + isinstance(node, Merge) + and isinstance(node.get_input_node(node.inputs[0]), Constant) + and isinstance(node.get_input_node(node.inputs[1]), Constant) + ) + + return is_match + + def transform(self, model, node): + """ + Merge of two constants makes another constant. + + Note: full precision is used in the calculation, and precision is not propagated. 
+ The precision + """ + const_node0 = node.get_input_node(node.inputs[0]) + const_node1 = node.get_input_node(node.inputs[1]) + + val0 = const_node0.attributes['value'] + val1 = const_node1.attributes['value'] + + op = node.attributes['op'] + if op == 'add': + new_val = val0 + val1 + elif op == 'subtract': + new_val = val0 - val1 + elif op == 'multiply': + new_val = val0 * val1 + elif op == 'divide': + new_val = val0 / val1 + elif op == 'average': + new_val = np.mean(np.array([val0, val1]), axis=0) + elif op == 'maximum': + new_val = np.maximum(val0, val1) + elif op == 'minimum': + new_val = np.minimum(val0, val1) + else: + raise RuntimeError(f'Unexpected op_type: {op}') + + quantizer = node.get_attr('quantizer') # None if not defined + const_node0.set_attr('quantizer', quantizer) # overwrite the quantizer + if quantizer: + const_node0.set_attr('quantizer', quantizer) + const_node0.get_output_variable().type.precision = quantizer.hls_type + const_node0.set_attr('value', new_val) + + model.remove_node(const_node1, rewire=False) + + # remove the batch norm node + model.remove_node(node, rewire=True) + + return True + + +class MergeToApplyAlpha(OptimizerPass): + """Convert Add, Sub, Mul, or Div Merges with constant to ApplyAlpha""" + + def match(self, node): + is_match = ( + isinstance(node, Merge) + and node.attributes['op'] in ('add', 'subtract', 'multiply') # Div is separate + and ( + isinstance(node.get_input_node(node.inputs[0]), Constant) + != isinstance(node.get_input_node(node.inputs[1]), Constant) + ) + ) + # note: != for booleans is xor. + return is_match + + def transform(self, model, node): + node1 = node.get_input_node(node.inputs[1]) + + node1const = isinstance(node1, Constant) + if node1const: + const_node = node1 + input_node_idx = 0 + const_node_idx = 1 + else: + const_node = node.get_input_node(node.inputs[0]) + input_node_idx = 1 + const_node_idx = 0 + + input_shape = node.get_input_variable(node.inputs[input_node_idx]).shape + n_in = np.prod(input_shape) + + # Note: precision is ignored if quantizer is not None + scale_precision = None + scale_quantizer = None + bias_precision = None + bias_quantizer = None + + op = node.attributes['op'] + if op == 'add': + scale = np.array(1) + scale_precision = IntegerPrecisionType(1, False) + bias = const_node.attributes['value'] + bias_quantizer = const_node.get_attr('quantizer') + elif op == 'subtract': + bias_quantizer = const_node.get_attr('quantizer') + if node1const: + scale = np.array(1) + scale_precision = IntegerPrecisionType(1, False) + bias = -const_node.attributes['value'] + if ( + bias_quantizer is not None + and isinstance(bias_quantizer.hls_type, (IntegerPrecisionType, FixedPrecisionType)) + and not bias_quantizer.hls_type.signed + ): + # need to make signed and increas the bit, if unsigned + bias_precision = FixedPrecisionType( + bias_quantizer.hls_type.width + 1, + bias_quantizer.hls_type.integer + 1, + True, + bias_quantizer.hls_type.rounding_mode, + bias_quantizer.hls_type.saturation_mode, + bias_quantizer.hls_type.saturation_bits, + ) + bias_quantizer = QuantNodeQuantizer(bias_precision) + else: + scale = np.array(-1) + scale_precision = IntegerPrecisionType(2, True) + bias = const_node.attributes['value'] + + elif op == 'multiply': + scale = const_node.attributes['value'] + scale_quantizer = const_node.get_attr('quantizer') + bias = np.array(0) + bias_precision = IntegerPrecisionType(1, False) + + # because C++ doesn't do broadcasting, we may have to change the shapes of the scale and bias + if scale.shape != 
tuple(input_shape) and np.squeeze(scale).shape != tuple(input_shape): + scale = np.broadcast_to(scale, input_shape) + if bias.shape != tuple(input_shape) and np.squeeze(bias).shape != tuple(input_shape): + bias = np.broadcast_to(bias, input_shape) + + attributes = { + 'scale_data': scale, + 'bias_data': bias, + 'n_in': n_in, + 'n_out': n_in, + 'n_filt': -1, + 'scale_precision': scale_precision, + 'scale_quantizer': scale_quantizer, + 'bias_precision': bias_precision, + 'bias_quantizer': bias_quantizer, + } + + # get the configuration name + config = model.config.get_layer_config(node) + new_name = f'bn_{node.name}' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) + + aa_layer = model.make_node( + ApplyAlpha, new_name, attributes, [node.inputs[input_node_idx]], [x for x in node.outputs] + ) + + model.remove_node(const_node, rewire=False) + del node.inputs[const_node_idx] + model.replace_node(node, aa_layer) + + return True + + +class MergeToApplyAlphaDiv(OptimizerPass): + """ + Convert Div Merges with constant to ApplyAlpha + + TODO: propagate precision + """ + + def match(self, node): + is_match = ( + isinstance(node, Merge) + and node.attributes['op'] == 'divide' + and isinstance(node.get_input_node(node.inputs[1]), Constant) + ) # only second can be const + + return is_match + + def transform(self, model, node): + input_shape = node.get_input_variable().shape + n_in = np.prod(input_shape) + const_node = node.get_input_node(node.inputs[1]) + scale = 1 / const_node.attributes['value'] + scale_quantizer = const_node.get_attr('quantizer') + if scale_quantizer: + scale_precision = scale_quantizer.hls_type + i_new = 1 + int(scale_precision.signed) + scale_precision.fractional + w_new = 1 + int(scale_precision.signed) + max(scale_precision.fractional, 0) + new_scale_precision = FixedPrecisionType( + w_new, + i_new, + scale_precision.signed, + rounding_mode=scale_precision.rounding_mode, + saturation_mode=scale_precision.saturation_mode, + saturation_bits=scale_precision.saturation_bits, + ) + scale_quantizer = QuantNodeQuantizer(new_scale_precision) + + bias = np.array(0) + bias_precision = IntegerPrecisionType(1, False) + + # because C++ doesn't do broadcasting, we may have to change the shapes of the scale and bias + if scale.shape != tuple(input_shape) and np.squeeze(scale).shape != tuple(input_shape): + scale = np.broadcast_to(scale, input_shape) + if bias.shape != tuple(input_shape) and np.squeeze(bias).shape != tuple(input_shape): + bias = np.broadcast_to(bias, input_shape) + + attributes = { + 'scale_data': scale, + 'bias_data': bias, + 'scale_quantizer': scale_quantizer, + 'bias_precision': bias_precision, + 'n_in': n_in, + 'n_out': n_in, + 'n_filt': -1, + } + + # get the configuration name + config = model.config.get_layer_config(node) + new_name = f'bn_{node.name}' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) + + bn_layer = model.make_node(ApplyAlpha, new_name, attributes, [node.inputs[0]], [x for x in node.outputs]) + + model.remove_node(const_node, rewire=False) + del node.inputs[1] + model.replace_node(node, bn_layer) + + return True diff --git a/hls4ml/model/optimizer/passes/move_scales.py b/hls4ml/model/optimizer/passes/move_scales.py new file mode 100644 index 0000000000..8fba1ec405 --- /dev/null +++ b/hls4ml/model/optimizer/passes/move_scales.py @@ -0,0 +1,519 @@ +''' +This file includes optimizations related to moving the ApplyAphas across MatMul and Conv nodes. 
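+
+As a sketch of the idea: for a scalar scale ``s`` and zero bias, an ApplyAlpha in front of a MatMul
+can be moved behind it because matrix multiplication is linear,
+
+    (s * x) @ W == s * (x @ W)
+
+so the passes below broadcast the scale (and, when the other operand is a constant, any nonzero
+bias) to the output shape and re-insert the ApplyAlpha after the node.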
+ +TODO: Check that biases are properly handled. (Attempt to do it via Merge) + +''' + +import warnings + +import numpy as np + +from hls4ml.model.layers import ApplyAlpha, Constant, Conv, MatMul, Merge +from hls4ml.model.optimizer import OptimizerPass + + +class ScaleDownMatMul(OptimizerPass): + '''Shift an ApplyAlpha below a MatMul''' + + def match(self, node): + ''' + Check to see if we have a MatMul with at least one input ApplyAlpha. + Note, if both are this optimizer runs twice. + ''' + is_match = ( + isinstance(node, MatMul) + and len(node.inputs) == 2 + and ( + isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha) + or isinstance(node.get_input_node(node.inputs[1]), ApplyAlpha) + ) + ) + return is_match + + def transform(self, model, node): + # determine input with ApplyAlpha. If both, first propagate apply alpha associated with a constant + is_aa = [False, False] + from_const = [False, False] + inp = [node.get_input_node(node.inputs[0]), node.get_input_node(node.inputs[1])] + for i in range(2): + if isinstance(inp[i], ApplyAlpha): + is_aa[i] = True + from_const[i] = isinstance(inp[i].get_input_node(inp[i].inputs[0]), Constant) + + # prefer alpha from constant + if from_const[0]: + alpha_idx = 0 + elif from_const[1]: + alpha_idx = 1 + elif is_aa[0]: + alpha_idx = 0 + else: + alpha_idx = 1 # is_aa[1] must be true + + apply_alpha = inp[alpha_idx] + other_idx = 0 if alpha_idx else 1 + + # Check if we can move + scale = apply_alpha.weights['scale'].data_unquantized + bias = apply_alpha.weights['bias'].data_unquantized + + scale, bias = _make_scalar(scale, bias) + + output = node.get_output_variable() + # to remove warning, since these get set again + new_attrs = {k: v for k, v in apply_alpha.attributes.items() if k not in ('trace', 'precision')} + + can_propagate = False + if not bias.shape and bias == 0: + # zero bias, propagate through, if possible + # (always possible if scale is scalar) + try: + newscale = np.broadcast_to(scale, output.shape) # check size compatibility + newbias = np.zeros(output.shape) + can_propagate = True + except ValueError: + can_propagate = False + + # if did not succeed in propagating, try again + if not can_propagate and isinstance(inp[other_idx], Constant): + # can handle nonzero bias in some cases if other value is a Constant + try: + newscale = np.broadcast_to(scale, output.shape) # check size compatibility + newbias = np.broadcast_to(inp[other_idx].attributes['value'] * bias, output.shape) + new_attrs.pop('bias_precision', None) # remove special bias precision settings + can_propagate = True + except ValueError: + can_propagate = False + + if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down MatMul node; model probably not suppored.', stacklevel=1 + ) + return False + + model.remove_node(apply_alpha) + + new_attrs['scale_data'] = newscale + new_attrs['bias_data'] = newbias + + new_node = model.make_node('ApplyAlpha', apply_alpha.name, new_attrs, [x for x in node.outputs]) + model.insert_node(new_node) + return True + + +class ScaleDownAdd(OptimizerPass): + '''Shift an identical ApplyAlpha below a Merge (Add)''' + + def match(self, node): + '''Check to see if we have an add with two ApplyAlphas with identical scale''' + is_match = isinstance(node, Merge) and len(node.inputs) == 2 and node.attributes["op"] == "add" + if is_match: + in0 = node.get_input_node(node.inputs[0]) + in1 = node.get_input_node(node.inputs[1]) + is_match = ( + isinstance(in0, ApplyAlpha) + and isinstance(in1, ApplyAlpha) + and 
(in0.weights['scale'].data_unquantized == in1.weights['scale'].data_unquantized).all() + ) + return is_match + + def transform(self, model, node): + in0 = node.get_input_node(node.inputs[0]) + in1 = node.get_input_node(node.inputs[1]) + + # Check if we can move + scale = in0.weights['scale'].data_unquantized + bias0 = in0.weights['bias'].data_unquantized + bias1 = in1.weights['bias'].data_unquantized + try: + bias = bias0 + bias1 + except ValueError: + warnings.warn( + 'Failed to propagate quantization scales down Add node; model probably not suppored.', stacklevel=1 + ) + return False + + model.remove_node(in0) + model.remove_node(in1) + + new_attrs = in0.attributes + new_attrs['scale_data'] = scale + new_attrs['bias_data'] = bias + + new_node = model.make_node('ApplyAlpha', in0.name, new_attrs, [x for x in node.outputs]) + model.insert_node(new_node) + return True + + +class BiasDownAdd(OptimizerPass): + '''Shift a ApplyAlpha with only bias below a Merge (Add)''' + + def match(self, node): + '''Match if there is only one ApplyAlpha. If there are two, if the scale of both is 0, they would + match the ScaleDownAdd, so this optimizer does not need to handle that case. + ''' + is_match = isinstance(node, Merge) and len(node.inputs) == 2 and node.attributes["op"] == "add" + if is_match: + in0 = node.get_input_node(node.inputs[0]) + in1 = node.get_input_node(node.inputs[1]) + is_match = (isinstance(in0, ApplyAlpha) or isinstance(in1, ApplyAlpha)) and not ( + isinstance(in0, ApplyAlpha) and isinstance(in1, ApplyAlpha) + ) # only one ApplyAlpha + return is_match + + def transform(self, model, node): + in0 = node.get_input_node(node.inputs[0]) + in1 = node.get_input_node(node.inputs[1]) + + alpha_node = in0 if isinstance(in0, ApplyAlpha) else in1 + + # Check if we can move + scale = alpha_node.weights['scale'].data_unquantized + + if (scale == 0).all(): + model.remove_node(alpha_node) + new_node = model.make_node('ApplyAlpha', alpha_node.name, alpha_node.attributes, [x for x in node.outputs]) + model.insert_node(new_node) + return True + else: + warnings.warn('Failed to propagate quantization bias down Add node; model probably not suppored.', stacklevel=1) + return False + + +class ScaleDownConv(OptimizerPass): + '''Shift an ApplyAlpha on a Conv with 2-3 inputs''' + + def match(self, node): + '''Shift an ApplyAlpha from the Weight''' + is_match = ( + isinstance(node, Conv) + and len(node.inputs) > 1 + and ( + isinstance(node.get_input_node(node.inputs[0]), ApplyAlpha) + or isinstance(node.get_input_node(node.inputs[1]), ApplyAlpha) + or (len(node.inputs) == 3 and isinstance(node.get_input_node(node.inputs[2]), ApplyAlpha)) + ) + ) + return is_match + + def transform(self, model, node): + in0 = node.get_input_node(node.inputs[0]) + in1 = node.get_input_node(node.inputs[1]) + in2 = node.get_input_node(node.inputs[2]) if len(node.inputs) == 3 else None + + aa0 = isinstance(in0, ApplyAlpha) + aa1 = isinstance(in1, ApplyAlpha) + aa2 = isinstance(in2, ApplyAlpha) if len(node.inputs) == 3 else False + + if not isinstance(in1, (Constant, ApplyAlpha)): + raise RuntimeError("The weight node needs to be ApplyAlpha or Constant") + if len(node.inputs) == 3 and not isinstance(in2, (Constant, ApplyAlpha)): + raise RuntimeError("The bias node needs to be ApplyAlpha or Constant") + + scale0 = in0.weights['scale'].data_unquantized if aa0 else None + bias0 = in0.weights['bias'].data_unquantized if aa0 else None + scale1 = in1.weights['scale'].data_unquantized if aa1 else None + bias1 = 
in1.weights['bias'].data_unquantized if aa1 else None + scale2 = in2.weights['scale'].data_unquantized if aa2 else None + bias2 = in2.weights['bias'].data_unquantized if aa2 else None + + # If possible, make scale and bias have scalar values + if aa0: + scale0, bias0 = _make_scalar(scale0, bias0) + if aa1: + scale1, bias1 = _make_scalar(scale1, bias1) + if aa2: + scale2, bias2 = _make_scalar(scale2, bias2) + + output = node.get_output_variable() + if aa0 and not aa1 and not aa2: + # only datapath has a scale + bias = in2.attributes['value'] if len(node.inputs) == 3 else 0 + conv_nobias = np.all(bias == 0) + + can_propagate = False + if not bias0.shape and bias0 == 0: + # No zero offset, propagate through, if possible + # (always possible if scale is scalar) + if conv_nobias: + try: + newscale = np.broadcast_to(_remove_redundant_dims(scale0), output.shape) # check broadcastable + newbias = np.zeros(output.shape) + can_propagate = True + except ValueError: + can_propagate = False + elif not scale0.shape: + # scalar scale0 + try: + newscale = np.broadcast_to(scale0, output.shape) # check broadcastable + newbias = np.broadcast_to(bias * (1 - scale0), output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not suppored.', stacklevel=1 + ) + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')} + new_name = in0.name + model.remove_node(in0) + + elif not aa0 and aa1 and not aa2: + # only weights have an ApplyAlpha + bias = in2.attributes['value'] if len(node.inputs) == 3 else 0 + conv_nobias = np.all(bias == 0) + + can_propagate = False + if not bias1.shape and bias1 == 0: + # No zero offset, propagate through, if possible + # (always possible if scale is scalar) + if conv_nobias: + try: + if scale1.ndim > 1: + # undo any broadcast_to + reduced_scale = _remove_redundant_dims(scale1) + if reduced_scale.shape[-1] == 1: + reduced_scale = reduced_scale[..., 0] + if node.attributes['n_dim'] == 1: + scale_trans = np.transpose(reduced_scale, (1, 0)) + else: + scale_trans = np.transpose(reduced_scale, (1, 2, 0)) + newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable + can_propagate = True + else: + newscale = np.broadcast_to(scale1, output.shape) # make sure broadcastable + can_propagate = True + newbias = np.zeros(output.shape) + except ValueError: + can_propagate = False + elif not scale1.shape: + # scalar scale1 + try: + newscale = np.broadcast_to(scale1, output.shape) # check broadcastable + newbias = np.broadcast_to(bias * (1 - scale1), output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not suppored.', stacklevel=1 + ) + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')} + new_name = in1.name + model.remove_node(in1) + + elif not aa0 and not aa1 and aa2: + # only bias has a scale + + can_propagate = False + if not scale2.shape and scale2 == 1: + # No scale, just additional bias + try: + newscale = np.ones(output.shape) + newbias = np.broadcast_to(bias2, output.shape) + can_propagate = True + except ValueError: + can_propagate = False + + if not can_propagate: + warnings.warn( + 'Failed to 
propagate quantization scales down Conv node; model probably not suppored.', stacklevel=1 + ) + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in2.attributes.items() if k not in ('trace', 'precision')} + new_name = in2.name + model.remove_node(in2) + + elif aa0 and aa1 and not aa2: + # dataflow and weights have an ApplyAlpha + bias = in2.attributes['value'] if len(node.inputs) == 3 else 0 + conv_nobias = np.all(bias == 0) + + can_propagate = False + if not bias0.shape and bias0 == 0 and not bias1.shape and bias1 == 0: + # No zero offset, propagate through, if possible + # (always possible if scale is scalar) + if conv_nobias: + try: + if scale1.ndim > 1: + # undo any broadcast_to + reduced_scale0 = _remove_redundant_dims(scale0) if scale0.ndim > 1 else scale0 + reduced_scale1 = _remove_redundant_dims(scale1) + reduced_scale = reduced_scale0 @ reduced_scale1 + if reduced_scale.shape[-1] == 1: + reduced_scale = reduced_scale[..., 0] + if node.attributes['n_dim'] == 1: + scale_trans = np.transpose(reduced_scale, (1, 0)) + else: + scale_trans = np.transpose(reduced_scale, (1, 2, 0)) + newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable + can_propagate = True + elif scale0.ndim > 1: + # scale1 is scalar + # undo any broadcast_to + reduced_scale0 = _remove_redundant_dims(scale0) + reduced_scale = scale1 * reduced_scale0 + if reduced_scale.shape[-1] == 1: + reduced_scale = reduced_scale[..., 0] + if node.attributes['n_dim'] == 1: + scale_trans = np.transpose(reduced_scale, (1, 0)) + else: + scale_trans = np.transpose(reduced_scale, (1, 2, 0)) + newscale = np.broadcast_to(scale_trans, output.shape) # make sure broadcastable + can_propagate = True + else: + newscale = np.broadcast_to(scale0 * scale1, output.shape) # make sure broadcastable + can_propagate = True + newbias = np.zeros(output.shape) + except ValueError: + can_propagate = False + elif not scale0.shape and not scale1.shape: + # scalar scale1 + try: + newscale = np.broadcast_to(scale0 * scale1, output.shape) # check broadcastable + newbias = np.broadcast_to(bias * (1 - scale0 * scale1), output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not suppored.', stacklevel=1 + ) + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')} + new_name = in1.name + model.remove_node(in0) + model.remove_node(in1) + + elif aa0 and not aa1 and aa2: + # datapath and bias have a scale + + can_propagate = False + if not bias0.shape and bias0 == 0 and not scale2.shape and not scale0.shape and scale2 == scale0: + # scalar scale0, no bais0 and scale2. 
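+                # Sketch of the identity used here, with s == scale0 == scale2 (scalar) and the
+                # underlying bias constant b:
+                #   conv(s * x, W) + (s * b + bias2) == s * (conv(x, W) + b) + bias2
+                # so s stays as the output scale and bias2 becomes the output bias.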
+ try: + newscale = np.broadcast_to(scale0, output.shape) # check broadcastable + newbias = np.broadcast_to(bias2, output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not suppored.', stacklevel=1 + ) + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')} + new_name = in0.name + model.remove_node(in0) + model.remove_node(in2) + + elif not aa0 and aa1 and aa2: + # only weights and bias have an ApplyAlpha + + can_propagate = False + if not bias1.shape and bias1 == 0 and not scale2.shape and not scale1.shape and scale2 == scale1: + # No zero offset, propagate through, if possible + # (always possible if scale is scalar) + if not scale1.shape: + # scalar scale1 + try: + newscale = np.broadcast_to(scale1, output.shape) # check broadcastable + newbias = np.broadcast_to(bias2, output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not suppored.', stacklevel=1 + ) + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in1.attributes.items() if k not in ('trace', 'precision')} + new_name = in1.name + model.remove_node(in1) + model.remove_node(in2) + + elif aa0 and aa1 and aa2: + # have all + + can_propagate = False + if ( + not bias0.shape + and bias0 == 0 + and not bias1.shape + and bias1 == 0 + and not scale2.shape + and not scale1.shape + and not scale0.shape + and scale2 == scale1 * scale0 + ): + # No zero offset, propagate through, if possible + # (always possible if scale is scalar) + if not scale1.shape: + # scalar scale1 + try: + newscale = np.broadcast_to(scale0 * scale1, output.shape) # check broadcastable + newbias = np.broadcast_to(bias2, output.shape) + can_propagate = True + except ValueError: + can_propagate = False + if not can_propagate: + warnings.warn( + 'Failed to propagate quantization scales down Conv node; model probably not suppored.', stacklevel=1 + ) + return False + + # to remove warning, since these get set again + new_attrs = {k: v for k, v in in0.attributes.items() if k not in ('trace', 'precision')} + new_name = in0.name + model.remove_node(in0) + model.remove_node(in1) + model.remove_node(in2) + + # after the big if-else above + new_attrs['scale_data'] = newscale + new_attrs['bias_data'] = newbias + + new_node = model.make_node('ApplyAlpha', new_name, new_attrs, [x for x in node.outputs]) + model.insert_node(new_node) + return True + + +def _remove_redundant_dims(X): + """This is somewhat of the inverse of broadcast-to. 
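+    For example, ``np.broadcast_to(np.arange(3).reshape(3, 1), (3, 4))`` is reduced back to shape
+    ``(3, 1)``.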
It sets the dimension size to 1 if all values are identical""" + + shape = X.shape + for i in range(len(shape)): + reduced = np.expand_dims(np.take(X, 0, axis=i), axis=i) + if np.all(reduced == X): + X = reduced + return X + + +def _make_scalar(scale, bias): + """Make the scale and bias scalar if possible""" + scale1d = np.ravel(scale) + if (scale1d[0] == scale).all(): + # scalar scale + scale = np.array(scale1d[0]) + + bias1d = np.ravel(bias) + if (bias1d[0] == bias).all(): + # scalar bias + bias = np.array(bias1d[0]) + + return scale, bias diff --git a/hls4ml/model/optimizer/passes/nop.py b/hls4ml/model/optimizer/passes/nop.py deleted file mode 100644 index 55fcf16e93..0000000000 --- a/hls4ml/model/optimizer/passes/nop.py +++ /dev/null @@ -1,14 +0,0 @@ -from hls4ml.model.layers import Activation -from hls4ml.model.optimizer import OptimizerPass - - -class EliminateLinearActivation(OptimizerPass): - def match(self, node): - cast = False - if isinstance(node, Activation): - cast = node.get_input_variable().type.precision != node.get_output_variable().type.precision - return isinstance(node, Activation) and node.get_attr('activation') == 'linear' and not cast - - def transform(self, model, node): - model.remove_node(node) - return True diff --git a/hls4ml/model/optimizer/passes/qkeras.py b/hls4ml/model/optimizer/passes/qkeras.py index ebc66fe59e..03690bed0d 100644 --- a/hls4ml/model/optimizer/passes/qkeras.py +++ b/hls4ml/model/optimizer/passes/qkeras.py @@ -1,7 +1,7 @@ import numpy as np import tensorflow as tf -from hls4ml.model.layers import BatchNormalization, register_layer +from hls4ml.model.layers import ApplyAlpha from hls4ml.model.optimizer import ConfigurableOptimizerPass, OptimizerPass, register_pass from hls4ml.model.quantizers import QKerasPO2Quantizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType @@ -77,40 +77,11 @@ def precision_string_modify(self, pstr): return pstr -class ApplyAlpha(BatchNormalization): - '''A custom layer to scale the output of a QDense layer which used 'alpha != 1' - Inference computation uses BatchNormalization methods''' - - def initialize(self): - inp = self.get_input_variable() - shape = inp.shape - dims = inp.dim_names - self.add_output_variable(shape, dims) - - scale = self.get_attr('scale_data') - scale_quantizer = self.get_attr('scale_quantizer') - bias = self.get_attr('bias_data') - bias_quantizer = self.get_attr('bias_quantizer') - - self.add_weights(scale, quantizer=scale_quantizer) - self.add_bias(bias, quantizer=bias_quantizer) - - def add_weights(self, scale, quantizer=None): - self.add_weights_variable(name='scale', var_name='s{index}', data=scale, quantizer=quantizer) - - def add_bias(self, bias, quantizer=None): - self.add_weights_variable(name='bias', var_name='b{index}', data=bias, quantizer=quantizer) - - def register_qkeras(): - # Register the layer types to the layer map - register_layer('ApplyAlpha', ApplyAlpha) - # Register the optimization passes register_pass('output_rounding_saturation_mode', OutputRoundingSaturationMode) register_pass('qkeras_factorize_alpha', QKerasFactorizeAlpha) register_pass('extract_ternary_threshold', ExtractTernaryThreshold) - register_pass('fuse_consecutive_batch_normalization', FuseConsecutiveBatchNormalization) class QKerasFactorizeAlpha(OptimizerPass): @@ -192,8 +163,16 @@ def transform(self, model, node): else: n_in = node.get_attr('n_out') + # the name of the new ApplyAlpha node + alpha_name = node.get_attr('name') + '_alpha' + + # make the precision auto + 
alpha_precision = {'Precision': 'auto'} + model.config.set_name_config(alpha_name, alpha_precision) + model.config.parse_name_config(alpha_name, alpha_precision) + attrs = { - 'name': node.get_attr('name') + '_alpha', + 'name': alpha_name, 'class_name': 'Alpha', 'inputs': node.outputs, 'n_in': n_in, @@ -210,38 +189,6 @@ def transform(self, model, node): return True -class FuseConsecutiveBatchNormalization(OptimizerPass): - '''OptimizerPass to merge consecutive BatchNormalization layers. - These may exist in a model after QKerasFactorizeAlpha layer. - Scale and Bias of each layer are combined into scale and bias of a single layer. - ''' - - def match(self, node): - return isinstance(node, BatchNormalization) and isinstance(node.get_input_node(), BatchNormalization) - - def transform(self, model, node): - bn0 = node.get_input_node() - bn1 = node - bn0_map = bn0.get_output_use_map() - bn1_map = bn1.get_output_use_map() - if len(bn0_map[bn0.name]) > 1 or len(bn1_map[bn1.name]) > 1: - return False - - s0 = bn0.weights['scale'].data - b0 = bn0.weights['bias'].data - s1 = bn1.weights['scale'].data - b1 = bn1.weights['bias'].data - - s2 = s0 * s1 - b2 = s1 * b0 + b1 - - bn0.weights['scale'].data = s2 - bn0.weights['bias'].data = b2 - - model.remove_node(node, rewire=True) - return True - - class ExtractTernaryThreshold(OptimizerPass): '''The input value (threshold) at which the output of a a ternary activation changes is configurable. This pass extracts that threshold point, inserting diff --git a/hls4ml/model/optimizer/passes/quant_opt.py b/hls4ml/model/optimizer/passes/quant_opt.py new file mode 100644 index 0000000000..04d5393748 --- /dev/null +++ b/hls4ml/model/optimizer/passes/quant_opt.py @@ -0,0 +1,383 @@ +""" +This file includes optimizations related to quant nodes. + +As a first step, QuantConstantParameters converts the extra inputs to attributes. + +The next step differs between the case of (1) (positive) power-of-2 scale and zero offset, or (2) other cases. In the first +case no explicit scaling is required, so a Quant node logically becomes a linear activation. (Cases when the scale is a +power of 2 not equal to one are implicitly scaled with fixed precision types.) When the activation is applied to a constant +weight, the activation is immediately merged with the weight, quantizing the weights. In case (2), we need to explicitly +scale and unscale, so the Quant node becomes 3 nodes, an ApplyAlpha node to apply a scale/shift, a Linear node to apply the +quantization, and another ApplyAlpha to unscale/shift. We depend on optimization steps to move the unscaling ApplyAlpha +down as needed so that we can do integer or fixed-point calculations. When the Quant is a applied to a weight, the scaling +and Linear nodes are immediately merged into the Constant. 
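+
+As a concrete example of case (1): a Quant node with ``bitwidth=8``, ``signed=True``, ``zeropt=0``
+and ``scale=0.25`` (a power of two) becomes a linear Activation whose result is a fixed-point type
+with 8 total bits and ``8 + log2(0.25) = 6`` integer bits, so the scaling is absorbed into the
+position of the binary point and no explicit multiplication is emitted.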
+ +""" + +import copy +import math # prefer to use math.ceil for scalar values + +import numpy as np + +from hls4ml.model.layers import Activation, ApplyAlpha, Constant, Quant +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.quantizers import QuantNodeQuantizer +from hls4ml.model.types import FixedPrecisionType + +_ALSO_MATCH_PO2 = True + + +class QuantConstantParameters(OptimizerPass): + """Remove Constant from the Qaunt node parameters (but not input[0])""" + + def match(self, node): + is_match = ( + isinstance(node, Quant) + and len(node.inputs) == 4 + and ( + (node.get_input_node(node.inputs[1]) and isinstance(node.get_input_node(node.inputs[1]), Constant)) + or (node.get_input_node(node.inputs[2]) and isinstance(node.get_input_node(node.inputs[2]), Constant)) + or (node.get_input_node(node.inputs[3]) and isinstance(node.get_input_node(node.inputs[3]), Constant)) + ) + ) + + return is_match + + def transform(self, model, node): + """ + Remove Constant from the Quant node parameters (but not input[0]) + """ + if node.get_input_node(node.inputs[1]): + scale_node = node.get_input_node(node.inputs[1]) + if isinstance(scale_node, Constant): + node.set_attr('scale', scale_node.get_attr('value')) + node.inputs[1] = '' + model.remove_node(scale_node, rewire=False) + + if node.get_input_node(node.inputs[2]): + zeropt_node = node.get_input_node(node.inputs[2]) + if isinstance(zeropt_node, Constant): + node.set_attr('zeropt', zeropt_node.get_attr('value')) + node.inputs[2] = '' + model.remove_node(zeropt_node, rewire=False) + + if node.get_input_node(node.inputs[3]): + bitwidth_node = node.get_input_node(node.inputs[3]) + if isinstance(bitwidth_node, Constant): + bitwidth = bitwidth_node.get_attr('value') + if bitwidth.size != 1: + raise RuntimeError('Only scalar bitwidth values are supporeted by the Quant node') + node.set_attr('bitwidth', bitwidth[0]) + node.inputs[3] = '' + model.remove_node(bitwidth_node, rewire=False) + + node.inputs = [inp for inp in node.inputs if inp] + if len(node.inputs) != 1: + raise RuntimeError("hls4ml only supports constant scale, zeropt, and bitwidth values") + + return True + + +class QuantToActivation(OptimizerPass): + """ + This is for the case when scale is a (positive) power of 2 and zeropt is 0. It is a a 1:1 transformation of + a Quant to an Activation. + + As an optimization, this is not called when the input is constant. 
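+
+    For example, ``scale = [0.5, 0.5, 0.5]`` with ``zeropt = 0`` matches (all scales equal and a
+    power of two), while ``scale = [0.5, 0.25]`` does not, since a single exponent has to describe
+    the whole tensor.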
+ """ + + def match(self, node): + # only matches after the other inputs are already folded + + is_match = ( + isinstance(node, Quant) + and len(node.inputs) == 1 + and not isinstance(node.get_input_node(node.inputs[0]), Constant) + ) + + # Only match if the scale is power of 2 and the zero-point is 0s + if is_match: # to make sure this is a quant node with inputs + scale = node.get_attr('scale') + bias = node.get_attr('zeropt') + is_match = is_match and (bias == np.zeros_like(bias)).all() + + # check if scale is ones-like or a power of two + scale_unit_or_po2 = (scale == np.ones_like(scale)).all() + if not scale_unit_or_po2 and _ALSO_MATCH_PO2: + # This optimization only works if all scales are the same + if np.all(scale[0] == scale): + mantissa, _ = np.frexp(scale[0]) + scale_unit_or_po2 = mantissa == 0.5 + + is_match = scale_unit_or_po2 + + return is_match + + def transform(self, model, node): + """ + Change quant node to Activation + """ + + rounding_mode = node.get_attr('rounding_mode') + narrow = node.get_attr('narrow') + signed = node.get_attr('signed') + bitwidth = node.get_attr('bitwidth') + integer = bitwidth + scale = node.get_attr('scale') + if _ALSO_MATCH_PO2 and not (scale == np.ones_like(scale)).all(): + _, exp = np.frexp(scale[0]) + integer = bitwidth + exp - 1 + + precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode) + + attributes = {'activation': 'linear', 'quantizer': quantizer} + + # update the configuration + config = model.config.get_layer_config(node) + prec_config = config.setdefault('Precision', {}) + prec_config['result'] = str(precision) + new_name = f'{node.name}_act' + model.config.set_name_config(new_name, config) + model.config.parse_name_config(new_name, config) + + new_node = model.make_node(Activation, new_name, attributes, [node.inputs[0]], [x for x in node.outputs]) + model.replace_node(node, new_node) + + return True + + +class FuseQuantWithConstant(OptimizerPass): + """ + This is for the case when scale is a positive power of 2 and zeropt is 0. + """ + + def match(self, node): + # only matches after the other inputs are already folded + is_match = ( + isinstance(node, Quant) and len(node.inputs) == 1 and isinstance(node.get_input_node(node.inputs[0]), Constant) + ) + + # Only match if the scale is power of 2 and the zero-point is 0s + if is_match: # to make sure this is a quant node with inputs + scale = node.get_attr('scale') + bias = node.get_attr('zeropt') + is_match = is_match and (bias == np.zeros_like(bias)).all() + + # check if scale is ones-like or a power of two + scale_unit_or_po2 = (scale == np.ones_like(scale)).all() + if not scale_unit_or_po2 and _ALSO_MATCH_PO2: + # This optimization only works if all scales are the same + if np.all(scale.item(0) == scale): + mantissa, _ = np.frexp(scale.item(0)) + scale_unit_or_po2 = mantissa == 0.5 + + is_match = scale_unit_or_po2 + + return is_match + + def transform(self, model, node): + """ + Fuse Quant with Constant. 
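+
+        The constant's values are left untouched here; only its quantizer and output precision are
+        updated, so the quantization is applied when the constant is later consumed (for example as
+        a layer's weights), and the Quant node itself is removed.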
+ """ + + rounding_mode = node.get_attr('rounding_mode') + narrow = node.get_attr('narrow') + signed = node.get_attr('signed') + bitwidth = node.get_attr('bitwidth') + integer = bitwidth + scale = node.get_attr('scale') + if _ALSO_MATCH_PO2 and not (scale == np.ones_like(scale)).all(): + _, exp = np.frexp(scale.item(0)) # know that np.all(scale.item(0) == scale) must be true + integer = bitwidth + exp - 1 + + precision, quantizer = _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode) + + const_node = node.get_input_node(node.inputs[0]) + const_node.set_attr('quantizer', quantizer) + const_node.get_output_variable().type.precision = precision + + # Should we update the configuration to reflect the new precision? I don't think it's necessary + + # remove the Quant node + model.remove_node(node, rewire=True) + + return True + + +class QuantToAlphaActivationAlpha(OptimizerPass): + """ + This is for the case when scale is not power-of-2 or zeropt is not 0. It is a a 1:3 transformation of + a Quant to an ApplyAlpha (to scale), Activatio, ApplyAlpho (to rescale). + + NOTE: It needs to be scheduled after QuantToActivation (or we need to make the match criteria stricter) + """ + + def match(self, node): + # only matches after the other inputs are already folded + is_match = ( + isinstance(node, Quant) + and len(node.inputs) == 1 + and not isinstance(node.get_input_node(node.inputs[0]), Constant) + ) + return is_match + + def transform(self, model, node): + """ + Change quant node to ApplyAlhpa, Activation, ApplyAlpha + """ + + # Do the Activation as in the simple case + + rounding_mode = node.get_attr('rounding_mode') + narrow = node.get_attr('narrow') + signed = node.get_attr('signed') + bitwidth = node.get_attr('bitwidth') + + precision, quantizer = _calculate_precision_quantizer(bitwidth, bitwidth, signed, narrow, rounding_mode) + + activation_attributes = {'activation': 'linear', 'quantizer': quantizer} + + # update the configuration + config = model.config.get_layer_config(node) + act_config = copy.deepcopy(config) + prec_config = act_config.setdefault('Precision', {}) + prec_config['result'] = str(precision) + act_name = f'{node.name}_act' + model.config.set_name_config(act_name, act_config) + model.config.parse_name_config(act_name, act_config) + + new_node = model.make_node(Activation, act_name, activation_attributes, [node.inputs[0]], [x for x in node.outputs]) + model.replace_node(node, new_node) + + # but now add the ApplyAlhpas before and after + + inshape = node.get_input_variable().shape + + scale = node.get_attr('scale') + bias = node.get_attr('zeropt') + + attributes_scale = {'n_filt': -1} + attributes_rescale = {'n_filt': -1} + + scale_config = copy.deepcopy(config) + scale_name = f'{node.name}_scale' + model.config.set_name_config(scale_name, scale_config) + model.config.parse_name_config(scale_name, scale_config) + + rescale_config = config # no need to deep copy the last + rescale_name = f'{node.name}_rescale' + model.config.set_name_config(rescale_name, rescale_config) + model.config.parse_name_config(rescale_name, rescale_config) + + firstscale = 1 / scale + firstbias = bias + attributes_scale['scale_data'] = np.broadcast_to(firstscale, inshape) + attributes_scale['bias_data'] = np.broadcast_to(firstbias, inshape) + + scale_node = model.make_node(ApplyAlpha, scale_name, attributes_scale, [node.inputs[0]]) + model.insert_node(scale_node) + + rescale = scale + rebias = -bias * scale + attributes_rescale['scale_data'] = np.broadcast_to(rescale, 
inshape) + attributes_rescale['bias_data'] = np.broadcast_to(rebias, inshape) + + rescale_node = model.make_node(ApplyAlpha, rescale_name, attributes_rescale, [new_node.outputs[0]]) + model.insert_node(rescale_node) + + return True + + +class ConstQuantToConstAlpha(OptimizerPass): + """ + This is for the case when scale is not power-of-2 or zeropt is not 0. It is a a 1:3 transformation of + a Quant to an ApplyAlpha (to scale), Activation, ApplyAlpho (to unscale), but an input + consts allows for optimization, so the ApplyAlpha (to scale), Activation are + optimized away right away. + """ + + def match(self, node): + # only matches after the other inputs are already folded + is_match = ( + isinstance(node, Quant) and len(node.inputs) == 1 and isinstance(node.get_input_node(node.inputs[0]), Constant) + ) + + if is_match: # to make sure this is a quant node with inputs + scale = node.get_attr('scale') + bias = node.get_attr('zeropt') + is_match = is_match and ((scale != np.ones_like(scale)).any() or (bias != np.zeros_like(bias)).any()) + return is_match + + def transform(self, model, node): + """ + Change Constant + Quant node to Constant, ApplyAlpha + """ + + rounding_mode = node.get_attr('rounding_mode') + narrow = node.get_attr('narrow') + signed = node.get_attr('signed') + bitwidth = node.get_attr('bitwidth') + + precision, quantizer = _calculate_precision_quantizer(bitwidth, bitwidth, signed, narrow, rounding_mode) + + const_node = node.get_input_node(node.inputs[0]) + + scale = node.get_attr('scale') + bias = node.get_attr('zeropt') + + # caclucate the new value + new_val = const_node.get_attr('value') / scale + bias + const_node.set_attr('value', new_val) + const_node.set_attr('quantizer', quantizer) + + const_node.get_output_variable().type.precision = precision + + inshape = node.get_input_variable().shape + + attributes_rescale = {'n_filt': -1} + + rescale_config = copy.deepcopy(model.config.get_layer_config(node)) + rescale_name = f'{node.name}_rescale' + model.config.set_name_config(rescale_name, rescale_config) + model.config.parse_name_config(rescale_name, rescale_config) + + rescale = scale + rebias = -bias * scale + attributes_rescale['scale_data'] = np.broadcast_to(rescale, inshape) + attributes_rescale['bias_data'] = np.broadcast_to(rebias, inshape) + + rescale_node = model.make_node( + ApplyAlpha, rescale_name, attributes_rescale, [x for x in node.inputs], [x for x in node.outputs] + ) + model.replace_node(node, rescale_node) + + return True + + +def _calculate_precision_quantizer(bitwidth, integer, signed, narrow, rounding_mode): + """ + A function to determine the precision and quantizer + """ + if rounding_mode == 'ROUND': + bn_round = 'AP_RND_CONV' + elif rounding_mode == 'FLOOR': + bn_round = 'AP_TRN' + else: + raise NotImplementedError( + f'Rounding mode {rounding_mode} not supported in Quant node. Only ROUND and FLOOR supported.' 
+ ) + + if narrow and not signed: + raise NotImplementedError('Narrow mode is only supported for singed numbers.') + + if narrow: + bn_sat = 'AP_SAT_SYM' + else: + bn_sat = 'AP_SAT' + + bitwidth = math.ceil(bitwidth) + integer = math.ceil(integer) + + precision = FixedPrecisionType(bitwidth, integer, signed, bn_round, bn_sat) + quantizer = QuantNodeQuantizer(precision) + return (precision, quantizer) diff --git a/hls4ml/model/optimizer/passes/reshape_const.py b/hls4ml/model/optimizer/passes/reshape_const.py new file mode 100644 index 0000000000..0012b2761e --- /dev/null +++ b/hls4ml/model/optimizer/passes/reshape_const.py @@ -0,0 +1,27 @@ +from hls4ml.model.layers import Constant, Reshape +from hls4ml.model.optimizer import OptimizerPass + + +class ReshapeConstant(OptimizerPass): + """ + ONNX has the target shape come as an input, not a parameter. This removes + the Constant input from new shape input. (Non-constant inputs are not supported.) + The constant value was already used; this is just a cleanup uptimization. + """ + + def match(self, node): + is_match = isinstance(node, Reshape) and len(node.inputs) > 1 and node.get_input_node(node.inputs[1]) + + return is_match + + def transform(self, model, node): + """ + Remove Constant from new shape input. Note, input shape node is already used on initialize + """ + shape_node = node.get_input_node(node.inputs[1]) + node.inputs[1] = '' + if not isinstance(shape_node, Constant): + raise RuntimeError("Nonconstant shape inputs are not currently supported") + model.remove_node(shape_node, rewire=False) + + return True diff --git a/hls4ml/model/optimizer/passes/resize_remove_constants.py b/hls4ml/model/optimizer/passes/resize_remove_constants.py new file mode 100644 index 0000000000..69039c60a2 --- /dev/null +++ b/hls4ml/model/optimizer/passes/resize_remove_constants.py @@ -0,0 +1,38 @@ +from warnings import warn + +from hls4ml.model.layers import Constant, Resize +from hls4ml.model.optimizer import OptimizerPass + + +class ResizeRemoveConstants(OptimizerPass): + """ + This optimizer is intended to clean the Resize node from RoI and Scales parameters that if left cause issues in hls4ml. + """ + + def match(self, node): + is_match = isinstance(node, Resize) and len(node.inputs) > 1 + return is_match + + def transform(self, model, node): + """ + Remove RoI and Scale Constant from new shape input. + """ + # see doc here: https://onnx.ai/onnx/operators/onnx__Resize.html + roi_index = 1 + scales_idx = 2 + scales_node = node.get_input_node(node.inputs[scales_idx]) + node.inputs[scales_idx] = '' + if not isinstance(scales_node, Constant): + raise RuntimeError("Non-constant shape inputs are not supported") + model.remove_node(scales_node, rewire=False) + # RoI position is always 1 when present + roi_node = node.get_input_node(node.inputs[roi_index]) + if roi_node.get_attr('value'): + warn('RoI value vector is not empty. 
Consider that RoI is not supported in hls4ml', stacklevel=2) + node.inputs[roi_index] = '' + if not isinstance(roi_node, Constant): + raise RuntimeError("Non-constant RoI inputs are not supported") + model.remove_node(roi_node, rewire=False) + # Clean all the '' inputs + node.inputs = list(filter(None, node.inputs)) + return True diff --git a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py index 38eef1e7d0..10840ec410 100644 --- a/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py +++ b/hls4ml/model/optimizer/passes/seperable_to_dw_conv.py @@ -1,5 +1,5 @@ """ -This optimizer converts a seperable convolution to a depthwise followed by a regular convolution. +This optimizer converts a separable convolution to a depthwise followed by a regular convolution. For backends with a custom pointwise implementations the regular convolution will subsequently be converted to a pointwise convolution by a different optimizer. """ @@ -10,8 +10,8 @@ from hls4ml.model.optimizer import OptimizerPass -class SeperableToDepthwiseAndConv(OptimizerPass): - """Convert Seperable to DepthwiseConv + Conv (potentially later Pointwise)""" +class SeparableToDepthwiseAndConv(OptimizerPass): + """Convert Separable to DepthwiseConv + Conv (potentially later Pointwise)""" _dw_attributes = ( 'in_width', @@ -70,7 +70,7 @@ def transform(self, model, node): model.config.parse_name_config(dw_name, dw_layer_config) # creating the attributes - dw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._dw_attributes if k in node.attributes} + dw_attributes = {k: node.attributes[k] for k in SeparableToDepthwiseAndConv._dw_attributes if k in node.attributes} dw_attributes['n_filt'] = dw_attributes['n_chan'] * dw_attributes['depth_multiplier'] dw_attributes['use_bias'] = False @@ -100,7 +100,7 @@ def transform(self, model, node): model.config.parse_name_config(pw_name, pw_layer_config) # creating the attributes - pw_attributes = {k: node.attributes[k] for k in SeperableToDepthwiseAndConv._pw_attributes if k in node.attributes} + pw_attributes = {k: node.attributes[k] for k in SeparableToDepthwiseAndConv._pw_attributes if k in node.attributes} pw_attributes['filt_width'] = 1 pw_attributes['filt_height'] = 1 pw_attributes['stride_width'] = 1 diff --git a/hls4ml/model/quantizers.py b/hls4ml/model/quantizers.py index c857ef51ac..a5b9ceb8c4 100644 --- a/hls4ml/model/quantizers.py +++ b/hls4ml/model/quantizers.py @@ -8,7 +8,14 @@ import tensorflow as tf from qkeras.quantizers import get_quantizer -from hls4ml.model.types import ExponentPrecisionType, FixedPrecisionType, IntegerPrecisionType, XnorPrecisionType +from hls4ml.model.types import ( + ExponentPrecisionType, + FixedPrecisionType, + IntegerPrecisionType, + RoundingMode, + SaturationMode, + XnorPrecisionType, +) class Quantizer: @@ -158,3 +165,98 @@ def __call__(self, data): if hasattr(y, 'numpy'): y = y.numpy() return y + + +class QuantNodeQuantizer(Quantizer): + """ + This implements a quantizer for a FixedPrecisionType with width==integer + + This is based on the sample implementation in finn-base + """ + + def __init__(self, precision): + super().__init__(precision.width, precision) + if not isinstance(precision, (FixedPrecisionType, IntegerPrecisionType)): + raise TypeError('QuantNodeQuantizer is only defined for FixedPrecisionType and IntegerPrecisionType') + + def __call__(self, data): + """Apply the quantization on the data""" + + scale = 2 ** (self.hls_type.width - self.hls_type.integer) + + 
data = data * scale # (not using *= to avoid modifying data) + # Clamping + min_int_val = self._min_int(self.hls_type.signed, self.hls_type.saturation_mode, self.bits) + max_int_val = self._max_int(self.hls_type.signed, self.bits) + data = np.where(data > max_int_val, max_int_val, data) + data = np.where(data < min_int_val, min_int_val, data) + # Rounding + rounding_fx = self._resolve_rounding_mode(self.hls_type.rounding_mode) + return rounding_fx(data) / scale + + @staticmethod + def _min_int(signed: bool, saturation_mode: str, bit_width: int) -> int: + """Compute the minimum integer representable by a given number of bits. + Args: + signed (bool): Indicates whether the represented integer is signed or not. + saturation_mode (bool): Indicates the saturation mode used (AP_SAT_SYM or AP_SAT) + bit_width (int): Number of bits available for the representation. + Returns: + int: Maximum unsigned integer that can be represented according to + the input arguments. + Examples: + >>> min_int(signed=True, saturation_mode='AP_SAT_SYM', bit_width=8) + int(-127) + >>> min_int(signed=False, saturation_mode='AP_SAT_SYM', bit_width=8) + int(0) + >>> min_int(signed=True, saturation_mode='AP_SAT', bit_width=8) + int(-128) + >>> min_int(signed=False, saturation_mode='AP_SAT_SYM', bit_width=8) + int(0) + """ + if saturation_mode not in (SaturationMode.SAT_SYM, SaturationMode.SAT, SaturationMode.WRAP): + raise ValueError( + f'Saturation mode {saturation_mode} not supported. Only AP_SAT_SYM, AP_SAT supported, WRAP partially' + ) + if signed and saturation_mode == SaturationMode.SAT_SYM: + value = -(2 ** (bit_width - 1)) + 1 + elif signed: + value = -(2 ** (bit_width - 1)) + else: + value = 0 + return value + + @staticmethod + def _max_int(signed: bool, bit_width: int) -> int: + """Compute the maximum integer representable by a given number of bits. + (Note, narrow and unsigned is not supported by the implementation, so saturation mode is not used) + Args: + signed (bool): Indicates whether the represented integer is signed or not. + bit_width (int): Number of bits available for the representation. + Returns: + Tensor: Maximum integer that can be represented according to + the input arguments. + Examples: + >>> max_int(signed=True, bit_width=8) + int(127) + >>> max_int(signed=False, bit_width=8) + int(255) + """ + if not signed: + value = (2**bit_width) - 1 + else: + value = (2 ** (bit_width - 1)) - 1 + return value + + @staticmethod + def _resolve_rounding_mode(mode): + """Resolve the rounding mode of Quant and Trunc ops + to the corresponding numpy functions.""" + if mode == RoundingMode.RND_CONV: + return np.round + # elif mode_string == 'CEIL': # not supported + # return np.ceil + elif mode == RoundingMode.TRN: + return np.floor + else: + raise ValueError(f'Rounding mode {mode} not supported.') diff --git a/hls4ml/model/types.py b/hls4ml/model/types.py index fb5cde3863..9d0a97440f 100644 --- a/hls4ml/model/types.py +++ b/hls4ml/model/types.py @@ -64,12 +64,15 @@ def __init__(self, width, signed): self.width = width self.signed = signed - def __eq__(self, other): + def __eq__(self, other: object) -> bool: eq = self.width == other.width eq = eq and self.signed == other.signed return eq + def __hash__(self) -> int: + return hash((self.width, self.signed)) + class IntegerPrecisionType(PrecisionType): """Arbitrary precision integer data type. 
@@ -88,12 +91,16 @@ def __str__(self): typestring = '{signed}int<{width}>'.format(signed='u' if not self.signed else '', width=self.width) return typestring - def __eq__(self, other): + # Does this need to make sure other is also an IntegerPrecisionType? I could see a match between Fixed and Integer + def __eq__(self, other: object) -> bool: if isinstance(other, IntegerPrecisionType): return super().__eq__(other) return False + def __hash__(self) -> int: + return super().__hash__() + @property def integer(self): return self.width @@ -136,6 +143,8 @@ def __init__(self, width=16, integer=6, signed=True, rounding_mode=None, saturat self.saturation_mode = saturation_mode self.saturation_bits = saturation_bits + # make this a property to avoid inconsistencies + @property def fractional(self): return self.width - self.integer @@ -183,7 +192,7 @@ def __str__(self): typestring = '{signed}fixed<{args}>'.format(signed='u' if not self.signed else '', args=args) return typestring - def __eq__(self, other): + def __eq__(self, other: object) -> bool: if isinstance(other, FixedPrecisionType): eq = super().__eq__(other) eq = eq and self.integer == other.integer @@ -194,6 +203,9 @@ def __eq__(self, other): return False + def __hash__(self) -> int: + return super().__hash__() ^ hash((self.integer, self.rounding_mode, self.saturation_mode, self.saturation_bits)) + class XnorPrecisionType(PrecisionType): """ @@ -204,6 +216,7 @@ def __init__(self): super().__init__(width=1, signed=False) self.integer = 1 + # TODO: this should really be a specific type def __str__(self): typestring = 'uint<1>' return typestring @@ -218,6 +231,7 @@ class ExponentPrecisionType(PrecisionType): def __init__(self, width=16, signed=True): super().__init__(width=width, signed=signed) + # TODO: this should really be a specific type, not int def __str__(self): typestring = '{signed}int<{width}>'.format(signed='u' if not self.signed else '', width=self.width) return typestring diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py index ab51ce1eb3..c626b70c2b 100644 --- a/hls4ml/optimization/__init__.py +++ b/hls4ml/optimization/__init__.py @@ -1,108 +1,3 @@ -import numpy as np - -from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config -from hls4ml.optimization.keras import optimize_model - -default_regularization_range = np.logspace(-6, -2, num=16).tolist() - - -def optimize_keras_model_for_hls4ml( - keras_model, - hls_config, - objective, - scheduler, - X_train, - y_train, - X_val, - y_val, - batch_size, - epochs, - optimizer, - loss_fn, - validation_metric, - increasing, - rtol, - callbacks=None, - ranking_metric='l1', - local=False, - verbose=False, - rewinding_epochs=1, - cutoff_bad_trials=3, - directory='hls4ml-optimization', - tuner='Bayesian', - knapsack_solver='CBC_MIP', - regularization_range=default_regularization_range, -): - ''' - Top-level function for optimizing a Keras model, given hls4ml config and a hardware objective(s) - - Args: - keras_model (keras.Model): Model to be optimized - hls_config (dict): hls4ml configuration, obtained from hls4ml.utils.config.config_from_keras_model(...) 
- objective (hls4ml.optimization.objectives.ObjectiveEstimator): - Parameter, hardware or user-defined objective of optimization - scheduler (hls4ml.optimization.scheduler.OptimizationScheduler): - Sparsity scheduler, choose between constant, polynomial and binary - X_train (np.array): Training inputs - y_train (np.array): Training labels - X_val (np.array): Validation inputs - y_val (np.array): Validation labels - batch_size (int): Batch size during training - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning - optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training - loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline - increasing (boolean): If the metric improves with increased values; - e.g. accuracy -> increasing = True, MSE -> increasing = False - rtol (float): Relative tolerance; - pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions - ranking_metric (string): Metric used for ranking weights and structures; - currently supported l1, l2, saliency and Oracle - local (boolean): Layer-wise or global pruning - verbose (boolean): Display debug logs during model optimization - rewinding_epochs (int): Number of epochs to retrain model without weight freezing, - allows regrowth of previously pruned weights - cutoff_bad_trials (int): After how many bad trials (performance below threshold), - should model pruning / weight sharing stop - directory (string): Directory to store temporary results - tuner (str): Tuning algorithm, choose between Bayesian, Hyperband and None - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing; - default usually works well; for very large networks, greedy algorithm might be more suitable - regularization_range (list): List of suitable hyperparameters for weight decay - - Returns: - keras.Model: Optimized model - ''' - - # Extract model attributes - model_attributes = get_attributes_from_keras_model_and_hls4ml_config(keras_model, hls_config) - - # Optimize model - return optimize_model( - keras_model, - model_attributes, - objective, - scheduler, - X_train, - y_train, - X_val, - y_val, - batch_size, - epochs, - optimizer, - loss_fn, - validation_metric, - increasing, - rtol, - callbacks=callbacks, - ranking_metric=ranking_metric, - local=local, - verbose=verbose, - rewinding_epochs=rewinding_epochs, - cutoff_bad_trials=cutoff_bad_trials, - directory=directory, - tuner=tuner, - knapsack_solver=knapsack_solver, - regularization_range=regularization_range, - ) +from .dsp_aware_pruning import optimize_keras_model_for_hls4ml # noqa: F401 +from .dsp_aware_pruning.attributes import get_attributes_from_keras_model_and_hls4ml_config # noqa: F401 +from .dsp_aware_pruning.keras import optimize_model # noqa: F401 diff --git a/hls4ml/optimization/dsp_aware_pruning/__init__.py b/hls4ml/optimization/dsp_aware_pruning/__init__.py new file mode 100644 index 0000000000..69e2029e0e --- /dev/null +++ b/hls4ml/optimization/dsp_aware_pruning/__init__.py @@ -0,0 +1,108 @@ +import numpy as np + +from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model_and_hls4ml_config +from hls4ml.optimization.dsp_aware_pruning.keras import optimize_model + 
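+# Example usage (a sketch only; the objective and scheduler names below are assumptions about the
+# surrounding package, and the training/validation data must be supplied by the caller):
+#
+#     from hls4ml.optimization.dsp_aware_pruning import optimize_keras_model_for_hls4ml
+#     from hls4ml.optimization.dsp_aware_pruning.objectives import ParameterEstimator
+#     from hls4ml.optimization.dsp_aware_pruning.scheduler import PolynomialScheduler
+#
+#     optimized_model = optimize_keras_model_for_hls4ml(
+#         keras_model, hls_config, ParameterEstimator, PolynomialScheduler(),
+#         X_train, y_train, X_val, y_val, batch_size=128, epochs=10,
+#         optimizer='adam', loss_fn='categorical_crossentropy',
+#         validation_metric='accuracy', increasing=True, rtol=0.98,
+#     )
+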
+default_regularization_range = np.logspace(-6, -2, num=16).tolist() + + +def optimize_keras_model_for_hls4ml( + keras_model, + hls_config, + objective, + scheduler, + X_train, + y_train, + X_val, + y_val, + batch_size, + epochs, + optimizer, + loss_fn, + validation_metric, + increasing, + rtol, + callbacks=None, + ranking_metric='l1', + local=False, + verbose=False, + rewinding_epochs=1, + cutoff_bad_trials=3, + directory='hls4ml-optimization', + tuner='Bayesian', + knapsack_solver='CBC_MIP', + regularization_range=default_regularization_range, +): + ''' + Top-level function for optimizing a Keras model, given hls4ml config and a hardware objective(s) + + Args: + keras_model (keras.Model): Model to be optimized + hls_config (dict): hls4ml configuration, obtained from hls4ml.utils.config.config_from_keras_model(...) + objective (hls4ml.optimization.objectives.ObjectiveEstimator): + Parameter, hardware or user-defined objective of optimization + scheduler (hls4ml.optimization.scheduler.OptimizationScheduler): + Sparsity scheduler, choose between constant, polynomial and binary + X_train (np.array): Training inputs + y_train (np.array): Training labels + X_val (np.array): Validation inputs + y_val (np.array): Validation labels + batch_size (int): Batch size during training + epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning + optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training + loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training + validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline + increasing (boolean): If the metric improves with increased values; + e.g. accuracy -> increasing = True, MSE -> increasing = False + rtol (float): Relative tolerance; + pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric + callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions + ranking_metric (string): Metric used for ranking weights and structures; + currently supported l1, l2, saliency and Oracle + local (boolean): Layer-wise or global pruning + verbose (boolean): Display debug logs during model optimization + rewinding_epochs (int): Number of epochs to retrain model without weight freezing, + allows regrowth of previously pruned weights + cutoff_bad_trials (int): After how many bad trials (performance below threshold), + should model pruning / weight sharing stop + directory (string): Directory to store temporary results + tuner (str): Tuning algorithm, choose between Bayesian, Hyperband and None + knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing; + default usually works well; for very large networks, greedy algorithm might be more suitable + regularization_range (list): List of suitable hyperparameters for weight decay + + Returns: + keras.Model: Optimized model + ''' + + # Extract model attributes + model_attributes = get_attributes_from_keras_model_and_hls4ml_config(keras_model, hls_config) + + # Optimize model + return optimize_model( + keras_model, + model_attributes, + objective, + scheduler, + X_train, + y_train, + X_val, + y_val, + batch_size, + epochs, + optimizer, + loss_fn, + validation_metric, + increasing, + rtol, + callbacks=callbacks, + ranking_metric=ranking_metric, + local=local, + verbose=verbose, + rewinding_epochs=rewinding_epochs, + cutoff_bad_trials=cutoff_bad_trials, + directory=directory, + 
tuner=tuner, + knapsack_solver=knapsack_solver, + regularization_range=regularization_range, + ) diff --git a/hls4ml/optimization/attributes.py b/hls4ml/optimization/dsp_aware_pruning/attributes.py similarity index 98% rename from hls4ml/optimization/attributes.py rename to hls4ml/optimization/dsp_aware_pruning/attributes.py index a7b6d74135..f652f27d50 100644 --- a/hls4ml/optimization/attributes.py +++ b/hls4ml/optimization/dsp_aware_pruning/attributes.py @@ -2,8 +2,8 @@ import hls4ml from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.config import SUPPORTED_LAYERS +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.config import SUPPORTED_LAYERS class hls4mlAttributes: diff --git a/hls4ml/optimization/config.py b/hls4ml/optimization/dsp_aware_pruning/config.py similarity index 100% rename from hls4ml/optimization/config.py rename to hls4ml/optimization/dsp_aware_pruning/config.py diff --git a/hls4ml/optimization/keras/__init__.py b/hls4ml/optimization/dsp_aware_pruning/keras/__init__.py similarity index 96% rename from hls4ml/optimization/keras/__init__.py rename to hls4ml/optimization/dsp_aware_pruning/keras/__init__.py index d67ddd5d26..29012bd39e 100644 --- a/hls4ml/optimization/keras/__init__.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/__init__.py @@ -7,13 +7,13 @@ # Enables printing of loss tensors during custom training loop from tensorflow.python.ops.numpy_ops import np_config -import hls4ml.optimization.keras.utils as utils -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.builder import build_optimizable_model, remove_custom_regularizers -from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS, TMP_DIRECTORY -from hls4ml.optimization.keras.masking import get_model_masks -from hls4ml.optimization.keras.reduction import reduce_model -from hls4ml.optimization.scheduler import OptimizationScheduler +import hls4ml.optimization.dsp_aware_pruning.keras.utils as utils +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.builder import build_optimizable_model, remove_custom_regularizers +from hls4ml.optimization.dsp_aware_pruning.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS, TMP_DIRECTORY +from hls4ml.optimization.dsp_aware_pruning.keras.masking import get_model_masks +from hls4ml.optimization.dsp_aware_pruning.keras.reduction import reduce_model +from hls4ml.optimization.dsp_aware_pruning.scheduler import OptimizationScheduler np_config.enable_numpy_behavior() default_regularization_range = np.logspace(-6, -2, num=16).tolist() diff --git a/hls4ml/optimization/keras/builder.py b/hls4ml/optimization/dsp_aware_pruning/keras/builder.py similarity index 98% rename from hls4ml/optimization/keras/builder.py rename to hls4ml/optimization/dsp_aware_pruning/keras/builder.py index f265ccdf48..4ba39e4f7b 100644 --- a/hls4ml/optimization/keras/builder.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/builder.py @@ -8,8 +8,8 @@ from tensorflow.keras.callbacks import EarlyStopping from tensorflow.keras.layers import Conv2D, Dense -from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, TMP_DIRECTORY -from hls4ml.optimization.keras.regularizers import Conv2DRegularizer, DenseRegularizer +from hls4ml.optimization.dsp_aware_pruning.keras.config 
import SUPPORTED_LAYERS, TMP_DIRECTORY +from hls4ml.optimization.dsp_aware_pruning.keras.regularizers import Conv2DRegularizer, DenseRegularizer co = {} _add_supported_quantized_objects(co) diff --git a/hls4ml/optimization/keras/config.py b/hls4ml/optimization/dsp_aware_pruning/keras/config.py similarity index 100% rename from hls4ml/optimization/keras/config.py rename to hls4ml/optimization/dsp_aware_pruning/keras/config.py diff --git a/hls4ml/optimization/keras/masking.py b/hls4ml/optimization/dsp_aware_pruning/keras/masking.py similarity index 99% rename from hls4ml/optimization/keras/masking.py rename to hls4ml/optimization/dsp_aware_pruning/keras/masking.py index 0e74997be8..dddeddf6f7 100644 --- a/hls4ml/optimization/keras/masking.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/masking.py @@ -6,9 +6,9 @@ from qkeras import QConv2D, QDense from tensorflow.keras.layers import Conv2D, Dense -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS -from hls4ml.optimization.knapsack import solve_knapsack +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS +from hls4ml.optimization.dsp_aware_pruning.knapsack import solve_knapsack def get_model_masks( diff --git a/hls4ml/optimization/keras/reduction.py b/hls4ml/optimization/dsp_aware_pruning/keras/reduction.py similarity index 96% rename from hls4ml/optimization/keras/reduction.py rename to hls4ml/optimization/dsp_aware_pruning/keras/reduction.py index 4ea8855aa8..12fb534799 100644 --- a/hls4ml/optimization/keras/reduction.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/reduction.py @@ -2,7 +2,7 @@ from tensorflow.keras.layers import Conv2D, Dense from tensorflow.keras.models import Sequential -from hls4ml.optimization.keras.utils import get_last_layer_with_weights +from hls4ml.optimization.dsp_aware_pruning.keras.utils import get_last_layer_with_weights def reduce_model(model): diff --git a/hls4ml/optimization/keras/regularizers.py b/hls4ml/optimization/dsp_aware_pruning/keras/regularizers.py similarity index 99% rename from hls4ml/optimization/keras/regularizers.py rename to hls4ml/optimization/dsp_aware_pruning/keras/regularizers.py index 1e885963c2..b42eb3f056 100644 --- a/hls4ml/optimization/keras/regularizers.py +++ b/hls4ml/optimization/dsp_aware_pruning/keras/regularizers.py @@ -1,7 +1,7 @@ import numpy as np import tensorflow as tf -from hls4ml.optimization.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES @tf.keras.utils.register_keras_serializable(name='DenseRegularizer') diff --git a/hls4ml/optimization/keras/utils.py b/hls4ml/optimization/dsp_aware_pruning/keras/utils.py similarity index 100% rename from hls4ml/optimization/keras/utils.py rename to hls4ml/optimization/dsp_aware_pruning/keras/utils.py diff --git a/hls4ml/optimization/knapsack.py b/hls4ml/optimization/dsp_aware_pruning/knapsack.py similarity index 100% rename from hls4ml/optimization/knapsack.py rename to hls4ml/optimization/dsp_aware_pruning/knapsack.py diff --git a/hls4ml/optimization/objectives/__init__.py b/hls4ml/optimization/dsp_aware_pruning/objectives/__init__.py similarity index 97% rename from hls4ml/optimization/objectives/__init__.py rename to hls4ml/optimization/dsp_aware_pruning/objectives/__init__.py index fcbef305b6..45204aaf73 100644 --- 
a/hls4ml/optimization/objectives/__init__.py +++ b/hls4ml/optimization/dsp_aware_pruning/objectives/__init__.py @@ -3,8 +3,8 @@ import numpy as np -from hls4ml.optimization.attributes import OptimizationAttributes -from hls4ml.optimization.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.attributes import OptimizationAttributes +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES ''' Pruning & weight sharing are formulated as an optimization problem, with the aim of minimizing some metric diff --git a/hls4ml/optimization/objectives/gpu_objectives.py b/hls4ml/optimization/dsp_aware_pruning/objectives/gpu_objectives.py similarity index 92% rename from hls4ml/optimization/objectives/gpu_objectives.py rename to hls4ml/optimization/dsp_aware_pruning/objectives/gpu_objectives.py index 8528a31839..bb3afc6397 100644 --- a/hls4ml/optimization/objectives/gpu_objectives.py +++ b/hls4ml/optimization/dsp_aware_pruning/objectives/gpu_objectives.py @@ -2,9 +2,9 @@ import numpy as np -from hls4ml.optimization.attributes import OptimizationAttributes -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.objectives import ObjectiveEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import OptimizationAttributes +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.objectives import ObjectiveEstimator class GPUFLOPEstimator(ObjectiveEstimator): diff --git a/hls4ml/optimization/objectives/vivado_objectives.py b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py similarity index 98% rename from hls4ml/optimization/objectives/vivado_objectives.py rename to hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py index 1e1a0a9792..9374f4aef8 100644 --- a/hls4ml/optimization/objectives/vivado_objectives.py +++ b/hls4ml/optimization/dsp_aware_pruning/objectives/vivado_objectives.py @@ -3,9 +3,9 @@ import numpy as np -from hls4ml.optimization.attributes import OptimizationAttributes -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.objectives import ObjectiveEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import OptimizationAttributes +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.objectives import ObjectiveEstimator # Optimizes DSP utilisation for Vivado backend diff --git a/hls4ml/optimization/scheduler.py b/hls4ml/optimization/dsp_aware_pruning/scheduler.py similarity index 100% rename from hls4ml/optimization/scheduler.py rename to hls4ml/optimization/dsp_aware_pruning/scheduler.py diff --git a/hls4ml/templates/catapult/myproject_bridge.cpp b/hls4ml/templates/catapult/myproject_bridge.cpp index f1326a1faf..9937adcf89 100755 --- a/hls4ml/templates/catapult/myproject_bridge.cpp +++ b/hls4ml/templates/catapult/myproject_bridge.cpp @@ -6,7 +6,7 @@ #include #include -static std::string s_weights_dir = "weights"; +// hls-fpga-machine-learning insert weights dir const char *get_weights_dir() { return s_weights_dir.c_str(); } diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_merge.h b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h index 00c2cf5e12..9cba030710 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_merge.h @@ -11,6 +11,7 @@ namespace nnet { struct merge_config { static const unsigned n_elem = 10; + static const 
unsigned reuse_factor = 1; }; struct dot_config { diff --git a/hls4ml/templates/catapult/nnet_utils/nnet_stream.h b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h index c76bfba5a6..ec2e9bfb1a 100644 --- a/hls4ml/templates/catapult/nnet_utils/nnet_stream.h +++ b/hls4ml/templates/catapult/nnet_utils/nnet_stream.h @@ -41,6 +41,26 @@ void clone_stream(ac_channel &data, ac_channel &res1, ac_channel< } } +template +void clone_stream(ac_channel &data, ac_channel &res1, ac_channel &res2, ac_channel &res3) { +#ifndef __SYNTHESIS__ + while (data.available(1)) +#endif + { + data_T in_data = data.read(); + res_T out_data; + + ClonePack: + for (int j = 0; j < data_T::size; j++) { + out_data[j] = in_data[j]; + } + + res1.write(out_data); + res2.write(out_data); + res3.write(out_data); + } +} + template void repack_stream(ac_channel &data, ac_channel &res) { if (data_T::size == res_T::size) { for (int i = 0; i < N / data_T::size; i++) { diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h new file mode 100644 index 0000000000..d2c774fcf8 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d.h @@ -0,0 +1,19 @@ +#ifndef NNET_DEPTH_CONV1D_H_ +#define NNET_DEPTH_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d.h" +#include "nnet_depthconv1d_resource.h" + +namespace nnet { + +template +void depthwise_conv_1d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + depthwise_conv_1d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h new file mode 100644 index 0000000000..c06b6b14e7 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv1d_resource.h @@ -0,0 +1,60 @@ +#ifndef NNET_DEPTH_CONV1D_LATENCY_H_ +#define NNET_DEPTH_CONV1D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_resource.h" +#include "nnet_mult.h" + +namespace nnet { + +template +void depthwise_conv_1d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + int depth_multiplier = CONFIG_T::n_filt / CONFIG_T::n_chan; + [[intel::fpga_register]] int res_idx = 0; + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::out_width * CONFIG_T::n_filt]; + +DM_LOOP: + #pragma unroll + for (int dm = 0; dm < depth_multiplier; dm++) { + + WIDTH_LOOP: + #pragma unroll + for (int w = 0; w < CONFIG_T::out_width; w++) { + + CHAN_LOOP: + #pragma unroll + for (int c = 0; c < CONFIG_T::n_chan; c++) { + + res_idx = (w * CONFIG_T::n_filt) + (c * depth_multiplier) + dm; + + acc[res_idx] = biases[c * depth_multiplier + dm]; + + KERNEL_W_LOOP: + #pragma unroll + for (int kw = 0; kw < CONFIG_T::filt_width; kw++) { + + int w_in = w * CONFIG_T::stride_width + kw - CONFIG_T::pad_left; + + if ((w_in >= 0) && (w_in < CONFIG_T::in_width)) { + + acc[res_idx] += CONFIG_T::mult_config:: + template product::product( + data[(w_in)*CONFIG_T::n_chan + c], + weights[(dm * CONFIG_T::filt_width * CONFIG_T::n_chan) + (kw * CONFIG_T::n_chan) + c]); + } + } + } + } + } + +RESULT: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::out_width * CONFIG_T::n_filt; ires++) { + res[ires] = cast(acc[ires]); + } +} +} // namespace nnet +#endif diff --git 
a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h new file mode 100644 index 0000000000..87dc1805d9 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d.h @@ -0,0 +1,19 @@ +#ifndef NNET_DEPTH_CONV2D_H_ +#define NNET_DEPTH_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d.h" +#include "nnet_depthconv2d_resource.h" + +namespace nnet { + +template +void depthwise_conv_2d_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + depthwise_conv_2d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h new file mode 100644 index 0000000000..91ddc28f65 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_depthconv2d_resource.h @@ -0,0 +1,76 @@ +#ifndef NNET_SEPARABLE_CONV2D_LATENCY_H_ +#define NNET_SEPARABLE_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_resource.h" +#include "nnet_mult.h" + +namespace nnet { + +template +void depthwise_conv_2d_resource_cl(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights, + const typename CONFIG_T::bias_t &biases) { + + int depth_multiplier = CONFIG_T::n_filt / CONFIG_T::n_chan; + [[intel::fpga_register]] int res_idx = 0; + + [[intel::fpga_register]] typename CONFIG_T::accum_t acc[CONFIG_T::out_width * CONFIG_T::out_height * CONFIG_T::n_filt]; + +DM_LOOP: + #pragma unroll + for (int dm = 0; dm < depth_multiplier; dm++) { + + HEIGHT_LOOP: + #pragma unroll + for (int h = 0; h < CONFIG_T::out_height; h++) { + WIDTH_LOOP: + #pragma unroll + for (int w = 0; w < CONFIG_T::out_width; w++) { + + CHAN_LOOP: + #pragma unroll + for (int c = 0; c < CONFIG_T::n_chan; c++) { + + res_idx = + (h * CONFIG_T::out_width * CONFIG_T::n_filt) + (w * CONFIG_T::n_filt) + (c * depth_multiplier) + dm; + + acc[res_idx] = biases[c * depth_multiplier + dm]; + + KERNEL_H_LOOP: + #pragma unroll + for (int kh = 0; kh < CONFIG_T::filt_height; kh++) { + KERNEL_W_LOOP: + #pragma unroll + for (int kw = 0; kw < CONFIG_T::filt_width; kw++) { + + int h_in = h * CONFIG_T::stride_height + kh - CONFIG_T::pad_top; + int w_in = w * CONFIG_T::stride_width + kw - CONFIG_T::pad_left; + + if ((h_in >= 0) && (h_in < CONFIG_T::in_height) && (w_in >= 0) && (w_in < CONFIG_T::in_width)) { + + acc[res_idx] += + CONFIG_T::mult_config::template product:: + product( + data[(h_in)*CONFIG_T::in_width * CONFIG_T::n_chan + (w_in)*CONFIG_T::n_chan + c], + weights[(dm * CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan) + + (kh * CONFIG_T::filt_width * CONFIG_T::n_chan) + + (kw * CONFIG_T::n_chan) + c]); + + ; + } + } + } + } + } + } + } + +RESULT: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::out_width * CONFIG_T::out_height * CONFIG_T::n_filt; ires++) { + res[ires] = cast(acc[ires]); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h index 550663b881..d1262f4377 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_merge.h @@ -7,6 +7,7 @@ namespace nnet { struct merge_config { static const unsigned n_elem = 10; + static const unsigned reuse_factor = 1; }; struct dot_config { diff --git 
a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h index 961c65037d..9dbbd92425 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv2d_resource.h @@ -1,6 +1,8 @@ #ifndef NNET_CONV2D_RESOURCE_H_ #define NNET_CONV2D_RESOURCE_H_ +#include + #include "nnet_common.h" #include "nnet_dense.h" #include "nnet_helpers.h" diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h index 766ef2e208..1ee9a9f564 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h @@ -7,6 +7,7 @@ namespace nnet { struct merge_config { static const unsigned n_elem = 10; + static const unsigned reuse_factor = 1; }; struct dot_config { diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h index 52a404672c..46beeacb03 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h @@ -4,6 +4,7 @@ #include "nnet_common.h" #include "nnet_conv1d_latency.h" #include "nnet_conv1d_resource.h" +#include "nnet_function_stubs.h" #include namespace nnet { @@ -38,11 +39,7 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully. //#pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); - } else { - conv_1d_resource_cl(data, res, weights, biases); - } + CONFIG_T::template conv_kernel::conv(data, res, weights, biases); } template @@ -55,13 +52,28 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully. 
//#pragma HLS INLINE recursive - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { + CONFIG_T::template conv_kernel::conv(data, res, weights, biases); +} + +template class Conv1DLatency : public Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE region conv_1d_latency_cl(data, res, weights, biases); - } else { + } +}; + +template class Conv1DResource : public Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE region conv_1d_resource_cl(data, res, weights, biases); } -} +}; } // namespace nnet diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h index 1bf25cc89c..e166cdd470 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h @@ -85,5 +85,83 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = CONFIG_T::mult_config::template product::product( + data[index_data], weights[index_weight]); + } + } // end channel loop + } 
// end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = cast(acc[ii][ff]); + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h index dd488ace5b..e3e53d1869 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_resource.h @@ -94,7 +94,8 @@ void conv_1d_resource_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], ResultLoop: for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { #pragma HLS UNROLL - *(res++) = cast(acc[i_pxl][i_res]); + res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] = + cast(acc[i_pxl][i_res]); } } } diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h index e0d30da34b..e427bd708d 100644 --- a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h +++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d_resource.h @@ -97,7 +97,8 @@ void conv_2d_resource_cl( ResultLoop: for (unsigned i_res = 0; i_res < mult_n_out; i_res++) { #pragma HLS UNROLL - *(res++) = cast(acc[i_pxl][i_res]); + res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] = + cast(acc[i_pxl][i_res]); } } } diff --git a/hls4ml/templates/vivado/build_lib.sh b/hls4ml/templates/vivado/build_lib.sh index 8b2daf185f..df719e2305 100755 --- a/hls4ml/templates/vivado/build_lib.sh +++ b/hls4ml/templates/vivado/build_lib.sh @@ -11,9 +11,10 @@ LDFLAGS= INCFLAGS="-Ifirmware/ap_types/" PROJECT=myproject LIB_STAMP=mystamp -WEIGHTS_DIR="\"weights\"" +BASEDIR="$(cd "$(dirname "$0")" && pwd)" +WEIGHTS_DIR="\"${BASEDIR}/firmware/weights\"" -${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR=${WEIGHTS_DIR} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o -${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR=${WEIGHTS_DIR} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o ${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so rm -f *.o diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index af37b0f4aa..05d4b8a4d5 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -161,7 +161,7 @@ if {$opt(reset)} { } else { open_solution "solution1" } -catch {config_array_partition -maximum_size 
4096} +catch {config_array_partition -maximum_size $maximum_size} config_compile -name_max_length 80 set_part $part config_schedule -enable_dsp_full_reg=false @@ -236,13 +236,13 @@ if {$opt(export)} { if {$opt(vsynth)} { puts "***** VIVADO SYNTHESIS *****" - if {[file exist ${project_name}_prj/solution1/syn/vhdl]} { + if {[file exist ${project_name}_prj/solution1/syn/verilog]} { set time_start [clock clicks -milliseconds] exec vivado -mode batch -source vivado_synth.tcl >@ stdout set time_end [clock clicks -milliseconds] report_time "VIVADO SYNTHESIS" $time_start $time_end } else { - puts "ERROR: Cannot find generated VHDL files. Did you run C synthesis?" + puts "ERROR: Cannot find generated Verilog files. Did you run C synthesis?" exit 1 } } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h index e4db43682e..6011e20cca 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_code_gen.h @@ -1,26 +1,22 @@ #ifndef NNET_INSTR_GEN_H_ #define NNET_INSTR_GEN_H_ +#include "nnet_conv1d_latency.h" #include "nnet_helpers.h" -#include -namespace nnet { +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_function_stubs.h" +#include "nnet_mult.h" -template class FillConv1DBuffer { - public: - static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition) { - // To be implemented in subclasses - } -}; +namespace nnet { -template class FillConv2DBuffer { +template class PointwiseConv1D { public: - static void - fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], - const unsigned partition) { + static void pointwise_conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { // To be implemented in subclasses } }; diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index fed0395a1a..6db3f62f6e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -2,6 +2,7 @@ #define NNET_COMMON_H_ #include "ap_fixed.h" +#include "nnet_helpers.h" // This is a substitute for "ceil(n/(float)d)". #define DIV_ROUNDUP(n, d) ((n + d - 1) / d) @@ -23,7 +24,7 @@ namespace nnet { // Common type definitions enum io_type { io_parallel = 0, io_stream }; -enum strategy { latency, resource }; +enum strategy { latency, resource, resource_unrolled }; /* --- * Balanced tree reduce implementation. 
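Since `resource_unrolled` is added to the strategy enum here, a short sketch of how the new strategy is selected from the Python side may help; it mirrors the pattern used in `test/pytest/test_dense_unrolled.py` further down in this patch (the output directory name is a placeholder).

```python
# Sketch: select the new resource_unrolled strategy via the model config
# (the strategy string is parsed case-insensitively, as the tests below check).
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

from hls4ml.converters import convert_from_keras_model
from hls4ml.utils import config_from_keras_model

model = Sequential([Dense(8, input_shape=(16,), name='dense')])
model.compile('adam', 'mse')

config = config_from_keras_model(model, backend='Vitis', default_reuse_factor=8)
config['Model']['Strategy'] = 'ResourceUnrolled'  # normalized to 'resource_unrolled'

hls_model = convert_from_keras_model(
    model, hls_config=config, output_dir='hls4mlprj_resource_unrolled_sketch', backend='Vitis'
)
hls_model.compile()
```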
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2e0211b49..72bce78067 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -4,6 +4,7 @@ #include "nnet_common.h" #include "nnet_conv1d_latency.h" #include "nnet_conv1d_resource.h" +#include "nnet_function_stubs.h" #include namespace nnet { @@ -37,11 +38,7 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { #pragma HLS INLINE region - if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); - } else { - conv_1d_resource_cl(data, res, weights, biases); - } + CONFIG_T::template conv_kernel::conv(data, res, weights, biases); } template @@ -53,13 +50,28 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], #pragma HLS INLINE region - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { + CONFIG_T::template conv_kernel::conv(data, res, weights, biases); +} + +template class Conv1DLatency : public Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region conv_1d_latency_cl(data, res, weights, biases); - } else { + } +}; + +template class Conv1DResource : public Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region conv_1d_resource_cl(data, res, weights, biases); } -} +}; } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h index 0d9afb10cb..ef2f94dcaf 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h @@ -84,5 +84,83 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], } } +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + #pragma HLS ARRAY_PARTITION variable=mult complete dim=0 + #pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + #pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + #pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + #pragma HLS 
ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + #pragma HLS UNROLL + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = CONFIG_T::mult_config::template product::product( + data[index_data], weights[index_weight]); + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + #pragma HLS UNROLL + res[ii * CONFIG_T::n_filt + ff] = cast(acc[ii][ff]); + } + } +} + } // namespace nnet #endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index b23c330c78..2b481930b7 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -60,6 +60,10 @@ void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + if (CONFIG_T::strategy == nnet::resource_unrolled && CONFIG_T::reuse_factor > 1) { + #pragma HLS allocation instances=compute_output_buffer_1d limit=1 function + } + ReadInputWidth: for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { #pragma HLS LOOP_FLATTEN diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h index 8a4fb6be81..1408b0db13 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_stream.h @@ -75,6 +75,11 @@ void conv_2d_buffer_cl( [CONFIG_T::n_chan]; #pragma HLS ARRAY_PARTITION variable = line_buffer complete dim = 2 + if (CONFIG_T::strategy == nnet::resource_unrolled && CONFIG_T::reuse_factor > 1) { + #pragma HLS allocation instances=compute_output_buffer_1d limit=1 function + #pragma HLS allocation instances=compute_output_buffer_2d limit=1 function + } + ReadInputHeight: for (unsigned i_ih = 0; i_ih < CONFIG_T::in_height; i_ih++) { ReadInputWidth: diff --git 
a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h index 7bd47442f6..dcd914dffe 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv_stream.h @@ -95,13 +95,8 @@ void mult_buffer(hls::stream data_window[CONFIG_T:: } #pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - dense_latency( - data, res, weights, biases); - } else { - dense_resource( - data, res, weights, biases); - } + CONFIG_T::mult_config::template kernel::dense(data, res, weights, biases); CastLoop: for (unsigned jj = 0; jj < CONFIG_T::n_filt; jj++) { @@ -290,13 +285,8 @@ void compute_output_buffer_2d( // Dense multiply // #pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - dense_latency( - kernel_data, res_out, weights, biases); - } else { - dense_resource( - kernel_data, res_out, weights, biases); - } + CONFIG_T::mult_config::template kernel::dense(kernel_data, res_out, weights, biases); // Pack output CastLoop: @@ -335,7 +325,7 @@ void compute_output_buffer_1d( const data_T &in_elem, hls::stream &res_stream, typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt], typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS INLINE + #pragma HLS INLINE OFF // Thresholds const static int lShiftX = CONFIG_T::filt_width - 1; @@ -360,14 +350,9 @@ void compute_output_buffer_1d( if ((sX - lShiftX) == 0 && pX > lShiftX - 1) { // Dense multiply - #pragma HLS INLINE recursive - if (CONFIG_T::strategy == nnet::latency) { - dense_latency( - kernel_data, res_out, weights, biases); - } else { - dense_resource( - kernel_data, res_out, weights, biases); - } + // #pragma HLS INLINE recursive + CONFIG_T::mult_config::template kernel::dense(kernel_data, res_out, weights, biases); // Pack output CastLoop: diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h index c5155d8485..d6c7beb70e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h @@ -5,6 +5,7 @@ #include "nnet_common.h" #include "nnet_dense_latency.h" #include "nnet_dense_resource.h" +#include "nnet_function_stubs.h" #include "nnet_helpers.h" #include "nnet_mult.h" #include @@ -27,7 +28,11 @@ struct dense_config { static const unsigned reuse_factor = 1; static const bool store_weights_in_bram = false; static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? + + template using kernel = nnet::DenseKernel; + + // Partitioning arrays cyclically to go with roll factors? 
+ // Product function to use template using product = nnet::product::mult; }; @@ -36,13 +41,41 @@ template void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - #pragma HLS inline - if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS INLINE + CONFIG_T::template kernel::dense(data, res, weights, biases); +} + +template class DenseLatency : public DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE dense_latency(data, res, weights, biases); - } else { - dense_resource(data, res, weights, biases); } -} +}; + +template +class DenseResource_rf_leq_nin : public DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE + dense_resource_rf_leq_nin(data, res, weights, biases); + } +}; + +template +class DenseResource_rf_gt_nin_rem0 : public DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS INLINE + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } +}; } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h index 88de94729b..333a0e75fe 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h @@ -26,10 +26,13 @@ void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T:: assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor #pragma HLS ARRAY_PARTITION variable=biases complete + if (CONFIG_T::reuse_factor > 1) { + #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + } + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; #pragma HLS ARRAY_PARTITION variable=acc complete @@ -97,10 +100,13 @@ void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor #pragma HLS ARRAY_PARTITION variable=biases complete + if (CONFIG_T::reuse_factor > 1) { + #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + } + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; #pragma HLS ARRAY_PARTITION variable=acc complete @@ -176,10 +182,13 @@ void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n assert((rufactor > nin) && "This function is correct only for RF > N_IN"); #pragma HLS function_instantiate variable=weights,biases - 
//#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor #pragma HLS ARRAY_PARTITION variable=biases complete + if (CONFIG_T::reuse_factor > 1) { + #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + } + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; #pragma HLS ARRAY_PARTITION variable=acc complete diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h index ad3a972ef6..3e3183480e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_stream.h @@ -16,10 +16,8 @@ void dense_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], #pragma HLS INLINE recursive if (CONFIG_T::strategy == nnet::latency) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - dense_latency(data, res, weights, biases); - } else { - dense_resource(data, res, weights, biases); } + CONFIG_T::template kernel::dense(data, res, weights, biases); } template diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h new file mode 100644 index 0000000000..97774bc95b --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_function_stubs.h @@ -0,0 +1,51 @@ +#ifndef NNET_FUNCTION_STUBS_H_ +#define NNET_FUNCTION_STUBS_H_ + +#include "nnet_helpers.h" + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" + +namespace nnet { + +template class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + // To be implemented in subclasses + } +}; + +template class Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h index 8005682978..979c447825 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h @@ -10,6 +10,7 @@ namespace nnet { struct merge_config { static const unsigned n_elem = 10; + static const unsigned reuse_factor = 1; }; struct dot_config { diff --git a/hls4ml/templates/vivado/vivado_synth.tcl b/hls4ml/templates/vivado/vivado_synth.tcl index 4634b166f6..342b1e6740 100644 --- a/hls4ml/templates/vivado/vivado_synth.tcl +++ b/hls4ml/templates/vivado/vivado_synth.tcl @@ -1,6 +1,7 @@ set tcldir [file dirname [info script]] source 
[file join $tcldir project.tcl] -add_files ${project_name}_prj/solution1/syn/vhdl +add_files ${project_name}_prj/solution1/syn/verilog synth_design -top ${project_name} -part $part +opt_design -retarget -propconst -sweep -bram_power_opt -shift_register_opt report_utilization -file vivado_synth.rpt diff --git a/hls4ml/utils/attribute_descriptions.py b/hls4ml/utils/attribute_descriptions.py new file mode 100644 index 0000000000..756f276fa1 --- /dev/null +++ b/hls4ml/utils/attribute_descriptions.py @@ -0,0 +1,51 @@ +"""Strings holding attribute descriptions.""" + +# Common attributes + +reuse_factor = ( + 'The number of times each multiplier is used by controlling the amount of pipelining/unrolling. ' + 'Lower number results in more parallelism and lower latency at the expense of the resources used.' + 'Reuse factor = 1 corresponds to all multiplications executed in parallel, and hence, the lowest possible latency.' +) + +index = 'Internal node counter used for bookkeeping and variable/tensor naming.' +trace = 'Enables saving of layer output (tracing) when using hls_model.predict(...) or hls_model.trace(...)' + +result_type = 'The datatype (precision) of the output tensor.' +accum_type = 'The datatype (precision) used to store intermediate results of the computation within the layer.' + +# Activation-related attributes + +table_size = 'The size of the lookup table used to approximate the function.' +table_type = 'The datatype (precision) used for the values of the lookup table.' + +softmax_implementation = ( + 'Choice of implementation of softmax function. ' + '"latency" provides good latency at the expense of extra resources. performs well on small number of classes. ' + '"stable" may require extra clock cycles but has better accuracy. ' + '"legacy" is the older implementation which has bad accuracy, but is fast and has low resource use. ' + 'It is superseded by the "latency" implementation for most applications. ' + '"argmax" is a special implementation that can be used if only the output with the highest probability is important. ' + 'Using this implementation will save resources and clock cycles.' +) +softmax_skip = 'If enabled, skips the softmax node and returns the raw outputs.' + +# Convolution-related attributes + +conv_pf = ( + 'The number of outputs computed in parallel. Essentially the number of multiplications of input window with the ' + 'convolution kernel occuring in parallel. ' + 'Higher number results in more parallelism (lower latency and II) at the expense of resources used.' + 'Currently only supported in io_parallel.' +) +conv_implementation = ( + '"LineBuffer" implementation is preferred over "Encoded" for most use cases. ' + 'This attribute only applies to io_stream.' +) + +# Recurrent-related attributes + +recurrent_static = ( + 'If set to True, will reuse the the same recurrent block for computation, resulting in lower resource ' + 'usage at the expense of serialized computation and higher latency/II.' +) diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 5cd17d02e9..e450084095 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -284,6 +284,7 @@ def config_from_pytorch_model( default_reuse_factor=1, channels_last_conversion='full', transpose_outputs=True, + max_precision=None, ): """Create an HLS conversion config given the PyTorch model. @@ -291,6 +292,15 @@ def config_from_pytorch_model( Users are advised to inspect the returned object to tweak the conversion configuration. 
The return object can be passed as `hls_config` parameter to `convert_from_pytorch_model`. + Note that hls4ml internally follows the keras convention for nested tensors known as + "channels last", whereas pytorch uses the "channels first" convention. + For example, for a tensor encoding an image with 3 channels, pytorch will expect the data + to be encoded as (Number_Of_Channels, Height, Width), whereas hls4ml expects + (Height, Width, Number_Of_Channels). By default, hls4ml will perform the necessary + conversions of the inputs and internal tensors automatically, but will return the output + in "channels last". However, this behavior can be controlled by the user using the + related arguments discussed below. + Args: model: PyTorch model input_shape (tuple or list of tuples): The shape of the input tensor, excluding the batch size. @@ -304,15 +314,19 @@ will generate config keys for every layer separately, allowing for highly specific configuration tweaks. backend(str, optional): Name of the backend to use - default_precision (str, optional): Default precision to use. Defaults to 'fixed<16,6>'. + default_precision (str, optional): Default precision to use. Defaults to 'fixed<16,6>'. Note, this must + be an explicit precision: 'auto' is not allowed. default_reuse_factor (int, optional): Default reuse factor. Defaults to 1. channels_last_conversion (string, optional): Configures the conversion of pytorch layers to - 'channels_last' dataformate. Can be set to 'full', 'internal', or 'off'. If 'full', both the inputs - and internal layers will be converted. If 'internal', only internal layers will be converted; this - assumes the inputs are converted by the user. If 'off', no conversion is performed. + 'channels_last' data format used by hls4ml internally. Can be set to 'full' (default), 'internal', + or 'off'. If 'full', both the inputs and internal layers will be converted. If 'internal', + only internal layers will be converted; this assumes the inputs are converted by the user. + If 'off', no conversion is performed. transpose_outputs (bool, optional): Set to 'False' if the output should not be transposed from channels_last into channels_first data format. Defaults to 'False'. If False, outputs needs to be transposed manually. + max_precision (str or None, optional): Maximum width precision to use. Defaults to None, meaning no maximum. + Note: Only integer and fixed precisions are supported. Raises: Exception: If PyTorch model has layers not supported by hls4ml.
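As a quick illustration of the arguments documented above, including the new `max_precision` cap and the channels-last handling, a hedged sketch follows; the PyTorch model and shapes are placeholders, and only parameters that appear in this patch are used.

```python
# Sketch: a placeholder PyTorch model turned into an hls4ml config using the
# parameters described in the docstring above (values are illustrative).
import torch.nn as nn

import hls4ml

model = nn.Sequential(nn.Conv2d(3, 4, kernel_size=3), nn.ReLU(), nn.Flatten())

config = hls4ml.utils.config_from_pytorch_model(
    model,
    input_shape=(3, 32, 32),          # channels-first, as PyTorch provides it
    default_precision='fixed<16,6>',  # must be explicit; 'auto' is not allowed here
    default_reuse_factor=1,
    channels_last_conversion='full',  # convert both inputs and internal tensors
    transpose_outputs=False,          # keep outputs in channels-last order
    max_precision='fixed<24,12>',     # upper bound for automatically inferred widths
)
```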
@@ -324,11 +338,16 @@ def config_from_pytorch_model( config = {} model_config = {} - model_config['Precision'] = default_precision + model_config['Precision'] = {} + model_config['Precision']['default'] = default_precision + if max_precision is not None: + model_config['Precision']['maximum'] = max_precision model_config['ReuseFactor'] = default_reuse_factor model_config['ChannelsLastConversion'] = channels_last_conversion model_config['TransposeOutputs'] = transpose_outputs model_config['Strategy'] = 'Latency' + model_config['BramFactor'] = 1_000_000_000 + model_config['TraceOutput'] = False config['Model'] = model_config config['PytorchModel'] = model @@ -372,7 +391,7 @@ def make_layer_config(layer): if name.endswith('_t'): name = name[:-2] if attr.default is None: - precision_cfg[name] = default_precision + precision_cfg[name] = 'auto' else: precision_cfg[name] = str(attr.default) elif attr.name == 'reuse_factor': @@ -413,7 +432,7 @@ def make_layer_config(layer): def config_from_onnx_model( - model, granularity='model', backend=None, default_precision='ap_fixed<16,6>', default_reuse_factor=1 + model, granularity='name', backend=None, default_precision='fixed<16,6>', default_reuse_factor=1, max_precision=None ): """Create an HLS conversion config given the ONNX model. @@ -423,8 +442,8 @@ def config_from_onnx_model( Args: model: ONNX model - granularity (str, optional): Granularity of the created config. Defaults to 'model'. - Can be set to 'model', 'type' and 'layer'. + granularity (str, optional): Granularity of the created config. Defaults to 'name'. + Can be set to 'model', 'type' and 'name'. Granularity can be used to generate a more verbose config that can be fine-tuned. The default granularity ('model') will generate config keys that apply to the whole @@ -435,6 +454,8 @@ def config_from_onnx_model( backend(str, optional): Name of the backend to use default_precision (str, optional): Default precision to use. Defaults to 'fixed<16,6>'. default_reuse_factor (int, optional): Default reuse factor. Defaults to 1. + max_precision (str or None, optional): Maximum width precision to use. Defaults to None, meaning no maximum. + Note: Only integer and fixed precisions are supported Raises: Exception: If ONNX model has layers not supported by hls4ml. @@ -443,13 +464,80 @@ def config_from_onnx_model( [dict]: The created config. 
""" + if granularity.lower() not in ['model', 'type', 'name']: + raise Exception( + f'Invalid configuration granularity specified, expected "model", "type" or "name" got "{granularity}"' + ) + + if backend is not None: + backend = hls4ml.backends.get_backend(backend) + elif granularity.lower() != 'model': + print('Warning: it is recommended to pass the backend to "config_from_onnx_model"') + config = {} model_config = {} - model_config['Precision'] = default_precision + model_config['Precision'] = {} + model_config['Precision']['default'] = default_precision + if max_precision is not None: + model_config['Precision']['maximum'] = max_precision model_config['ReuseFactor'] = default_reuse_factor model_config['Strategy'] = 'Latency' + model_config['BramFactor'] = 1_000_000_000 + model_config['TraceOutput'] = False config['Model'] = model_config + layer_list, _, _ = hls4ml.converters.parse_onnx_model(model) + + def make_layer_config(layer): + cls_name = layer['class_name'] + + layer_cls = hls4ml.model.layers.layer_map[cls_name] + if backend is not None: + layer_cls = backend.create_layer_class(layer_cls) + + layer_config = {} + + # set the default precision of the layer to auto? + # (not really necessary if we set the backend appropriately) + # layer_config['Precision'] = {'default': 'auto'} + + config_attrs = [a for a in layer_cls.expected_attributes if a.configurable] + for attr in config_attrs: + if isinstance(attr, hls4ml.model.attributes.TypeAttribute): + precision_cfg = layer_config.setdefault('Precision', {}) + name = attr.name + if name.endswith('_t'): + name = name[:-2] + if attr.default is None: + precision_cfg[name] = 'auto' + else: + precision_cfg[name] = str(attr.default) + elif attr.name == 'reuse_factor': + layer_config[attr.config_name] = default_reuse_factor + else: + if attr.default is not None: + layer_config[attr.config_name] = attr.default + + return layer_config + + if granularity.lower() == 'type': + type_config = {} + for layer in layer_list: + if layer['class_name'] in type_config: + continue + layer_config = make_layer_config(layer) + type_config[layer['class_name']] = layer_config + + config['LayerType'] = type_config + + elif granularity.lower() == 'name': + name_config = {} + for layer in layer_list: + layer_config = make_layer_config(layer) + name_config[layer['name']] = layer_config + + config['LayerName'] = name_config + return config diff --git a/hls4ml/utils/string_utils.py b/hls4ml/utils/string_utils.py index fa341cd8af..a08c4c52a7 100644 --- a/hls4ml/utils/string_utils.py +++ b/hls4ml/utils/string_utils.py @@ -10,7 +10,8 @@ def convert_to_snake_case(pascal_case): Returns: str: converted string """ - return re.sub(r'(?=1.4.0 pydigitalwavetools==1.1 + pyparsing pyyaml - qkeras tabulate - tensorflow + tensorflow>=2.8.0,<=2.14.1 tensorflow-model-optimization<=0.7.5 -python_requires = >=3.10 +python_requires = >=3.10, <3.12 include_package_data = True scripts = scripts/hls4ml diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py index b130b43cef..adc3d680ab 100644 --- a/test/pytest/generate_ci_yaml.py +++ b/test/pytest/generate_ci_yaml.py @@ -18,13 +18,14 @@ EXAMPLEMODEL: {} """ + n_test_files_per_yml = int(os.environ.get('N_TESTS_PER_YAML', 4)) # Blacklisted tests will be skipped BLACKLIST = {'test_reduction'} # Long-running tests will not be bundled with other tests -LONGLIST = {'test_hgq_layers'} +LONGLIST = {'test_hgq_layers', 'test_hgq_players', 'test_qkeras', 'test_pytorch_api'} def path_to_name(test_path): @@ -71,7 +72,7 @@ def 
generate_test_yaml(test_root='.'): name = path.stem.replace('test_', '') test_file = str(path.relative_to(test_root)) needs_examples = uses_example_model(path) - diff_yml = yaml.safe_load(template.format(name, test_file, needs_examples)) + diff_yml = yaml.safe_load(template.format(name, test_file, int(needs_examples))) yml.update(diff_yml) return yml diff --git a/test/pytest/test_activations.py b/test/pytest/test_activations.py index 5d97067c4a..d1ccba512c 100644 --- a/test/pytest/test_activations.py +++ b/test/pytest/test_activations.py @@ -19,6 +19,7 @@ [ (ReLU(), 'relu'), (LeakyReLU(alpha=1.5), 'leaky_relu'), + (Activation('leaky_relu'), 'leaky_relu_act'), (ThresholdedReLU(theta=0.75), 'threshold_relu'), (ELU(alpha=1.25), 'elu'), (Activation('selu'), 'selu'), diff --git a/test/pytest/test_dense_unrolled.py b/test/pytest/test_dense_unrolled.py new file mode 100644 index 0000000000..5d3e8f4acb --- /dev/null +++ b/test/pytest/test_dense_unrolled.py @@ -0,0 +1,147 @@ +from pathlib import Path + +import numpy as np +import pytest +from tensorflow.keras.layers import GRU, LSTM, Conv1D, Conv2D, Dense, Flatten +from tensorflow.keras.models import Sequential + +from hls4ml.converters import convert_from_keras_model +from hls4ml.utils import config_from_keras_model + +test_root_path = Path(__file__).parent + + +@pytest.mark.parametrize('strategy', ['ResourceUnrolled', 'resource_unrolled', 'Resource_Unrolled']) +def test_resource_unrolled_parsing(strategy): + model = Sequential() + model.add( + Dense(8, input_shape=(16,), kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform', name='dense') + ) + model.compile('adam', 'mse') + + config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', backend='Vitis', default_reuse_factor=8) + config['Model']['Strategy'] = strategy + + output_dir = str(test_root_path / f'hls4mlprj_resource_unrolled_parsing_{strategy}') + hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend='Vitis') + + # Check if strategy was not overridden + assert list(hls_model.get_layers())[1].get_attr('strategy') == 'resource_unrolled' + + +# Tests a wide range of RF to ensure the unrolled resource kernel is correct +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +@pytest.mark.parametrize('reuse_factor', [1, 2, 4, 8, 16, 32, 48, 64, 96, 192]) +@pytest.mark.parametrize('backend', ['Vitis', 'Vivado']) +def test_resource_unrolled_dense(io_type, reuse_factor, backend): + input_shape = (16,) + X = np.random.rand(100, *input_shape) + + model = Sequential() + model.add( + Dense( + 12, input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform', name='dense' + ) + ) + model.compile('adam', 'mse') + keras_prediction = model.predict(X) + + config = config_from_keras_model( + model, default_precision='ac_fixed<32, 16>', backend=backend, default_reuse_factor=reuse_factor + ) + config['Model']['Strategy'] = 'ResourceUnrolled' + + output_dir = str(test_root_path / f'hls4mlprj_resource_unrolled_dense_{io_type}_{reuse_factor}_{backend}') + hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type) + + # Check if strategy was not overridden + assert list(hls_model.get_layers())[1].get_attr('strategy') == 'resource_unrolled' if reuse_factor > 1 else 'latency' + + hls_model.compile() + + hls_prediction = hls_model.predict(X) + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=1e-2) + + +# Tests a wide 
range RF on streaming Conv1D/2D to ensure the unrolled resource kernel is correct +@pytest.mark.parametrize('dim', [1, 2]) +@pytest.mark.parametrize('io_type', ['io_stream']) +@pytest.mark.parametrize('reuse_factor', [1, 3, 9, 27, 54, 108]) +def test_resource_unrolled_streaming_conv(dim, io_type, reuse_factor): + input_shape = (8,) * dim + (3,) + X = np.random.rand(100, *input_shape) + conv_class = Conv1D if dim == 1 else Conv2D + + model = Sequential() + model.add( + conv_class( + 4, (3,) * dim, input_shape=input_shape, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform' + ) + ) + model.add(Flatten()) + model.add(Dense(1, kernel_initializer='lecun_uniform', bias_initializer='lecun_uniform')) + model.compile('adam', 'mse') + keras_prediction = model.predict(X) + + config = config_from_keras_model(model, default_precision='ac_fixed<32, 16>', default_reuse_factor=reuse_factor) + config['Model']['Strategy'] = 'ResourceUnrolled' + + output_dir = str(test_root_path / f'hls4mlprj_resource_unrolled_conv{dim}d_{io_type}_{reuse_factor}') + hls_model = convert_from_keras_model(model, hls_config=config, output_dir=output_dir, backend='Vivado', io_type=io_type) + + # Check if strategy was not overridden + assert list(hls_model.get_layers())[1].get_attr('strategy') == 'resource_unrolled' if reuse_factor > 1 else 'latency' + + hls_model.compile() + + hls_prediction = hls_model.predict(X) + np.testing.assert_allclose(hls_prediction, keras_prediction, rtol=0, atol=1e-2) + + +@pytest.mark.parametrize('rnn_layer', [LSTM, GRU]) +@pytest.mark.parametrize('backend', ['Vitis', 'Vivado']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +@pytest.mark.parametrize('static', [True, False]) +@pytest.mark.parametrize('reuse_factor', [1, 4, 32, 128]) # RF=128 also tests if setting closest RF works well +def test_resource_unrolled_rnn(rnn_layer, backend, io_type, static, reuse_factor): + # Subtract 0.5 to include negative values + input_shape = (12, 8) + X = np.random.rand(50, *input_shape) - 0.5 + + layer_name = rnn_layer.__name__.lower() + keras_model = Sequential() + keras_model.add( + rnn_layer( + units=8, + input_shape=input_shape, + kernel_initializer='lecun_uniform', + recurrent_initializer='lecun_uniform', + bias_initializer='lecun_uniform', + return_sequences=False, + name=layer_name, + ) + ) + keras_model.compile() + + default_precision = 'ap_fixed<32, 16>' if backend in ['Vivado', 'Vitis'] else 'ac_fixed<32, 16, true>' + hls_config = config_from_keras_model( + keras_model, granularity='name', default_precision=default_precision, backend=backend + ) + hls_config['LayerName'][layer_name]['static'] = static + hls_config['LayerName'][layer_name]['Strategy'] = 'ResourceUnrolled' + hls_config['LayerName'][layer_name]['ReuseFactor'] = reuse_factor + prj_name = f'hls4mlprj_resource_unrolled_rnn_{layer_name}_static_{int(static)}_{io_type}_{reuse_factor}_{backend}' + output_dir = str(test_root_path / prj_name) + + hls_model = convert_from_keras_model( + keras_model, hls_config=hls_config, output_dir=output_dir, backend=backend, io_type=io_type + ) + + # Check if strategy was not overridden + assert list(hls_model.get_layers())[1].get_attr('strategy') == 'resource_unrolled' if reuse_factor > 1 else 'latency' + + hls_model.compile() + + keras_prediction = keras_model.predict(X) + hls_prediction = hls_model.predict(X) + np.testing.assert_allclose(hls_prediction.flatten(), keras_prediction.flatten(), rtol=0.0, atol=5e-2) diff --git a/test/pytest/test_depthconv1d.py 
b/test/pytest/test_depthconv1d.py index 3734815af0..85c8e2ac4f 100644 --- a/test/pytest/test_depthconv1d.py +++ b/test/pytest/test_depthconv1d.py @@ -23,6 +23,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'), diff --git a/test/pytest/test_depthconv2d.py b/test/pytest/test_depthconv2d.py index 9178edf368..4832cb1ae9 100644 --- a/test/pytest/test_depthconv2d.py +++ b/test/pytest/test_depthconv2d.py @@ -24,6 +24,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'), diff --git a/test/pytest/test_hgq_layers.py b/test/pytest/test_hgq_layers.py index 92a7ea1876..80d96fbcda 100644 --- a/test/pytest/test_hgq_layers.py +++ b/test/pytest/test_hgq_layers.py @@ -19,7 +19,6 @@ Signature, ) from HGQ.proxy import to_proxy_model -from HGQ.proxy.fixed_point_quantizer import gfixed from tensorflow import keras from hls4ml.converters import convert_from_keras_model @@ -79,51 +78,6 @@ def run_model_test( _run_synth_match_test(proxy, data, io_type, backend, dir, cond=cond) -def create_player_model(layer: str, rnd_strategy: str, io_type: str): - pa_config = get_default_paq_conf() - pa_config['rnd_strategy'] = rnd_strategy - pa_config['skip_dims'] = 'all' if io_type == 'io_stream' else 'batch' - set_default_paq_conf(pa_config) - - inp = keras.Input(shape=(15)) - if 'PConcatenate' in layer: - _inp = [HQuantize()(inp)] * 2 - out = eval(layer)(_inp) - out = HDense(15)(out) - return keras.Model(inp, out) - elif 'Signature' in layer: - _inp = eval(layer)(inp) - out = HDense(15)(_inp) - return keras.Model(inp, out) - elif 'Pool2D' in layer: - _inp = PReshape((3, 5, 1))(HQuantize()(inp)) - elif 'Pool1D' in layer: - _inp = PReshape((5, 3))(HQuantize()(inp)) - elif 'Dense' in layer or 'Activation' in layer: - _inp = HQuantize()(inp) - elif 'Flatten' in layer: - out = HQuantize()(inp) - out = PReshape((3, 5))(out) - out = HConv1D(2, 2)(out) - out = eval(layer)(out) - out = HDense(15)(out) - return keras.Model(inp, out) - else: - raise Exception(f'Please add test for {layer}') - - out = eval(layer)(_inp) - model = keras.Model(inp, out) - - for layer in model.layers: - # No weight bitwidths to randomize - # And activation bitwidths - if hasattr(layer, 'paq'): - fbw: tf.Variable = layer.paq.fbw - fbw.assign(tf.constant(np.random.uniform(4, 6, fbw.shape).astype(np.float32))) - - return model - - def create_hlayer_model(layer: str, rnd_strategy: str, io_type: str): pa_config = get_default_paq_conf() pa_config['rnd_strategy'] = rnd_strategy @@ -222,43 +176,3 @@ def test_syn_hlayers(layer, N: int, rnd_strategy: str, io_type: str, cover_facto path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive, cond=cond) - - -@pytest.mark.parametrize( - 'layer', - [ - "PConcatenate()", - "PMaxPool1D(2, padding='same')", - "PMaxPool1D(4, padding='same')", - "PMaxPool2D((5,3), padding='same')", - "PMaxPool1D(2, padding='valid')", - "PMaxPool2D((2,3), padding='valid')", - "Signature(1,6,3)", - "PAvgPool1D(2, padding='same')", - "PAvgPool2D((1,2), padding='same')", - "PAvgPool2D((2,2), padding='same')", - "PAvgPool1D(2, padding='valid')", - "PAvgPool2D((1,2), padding='valid')", - "PAvgPool2D((2,2), padding='valid')", - "PFlatten()", - ], -) -@pytest.mark.parametrize("N", [1000]) 
-@pytest.mark.parametrize("rnd_strategy", ['floor', 'standard_round']) -@pytest.mark.parametrize("io_type", ['io_parallel', 'io_stream']) -@pytest.mark.parametrize("cover_factor", [1.0]) -@pytest.mark.parametrize("aggressive", [True, False]) -@pytest.mark.parametrize("backend", ['vivado', 'vitis']) -def test_syn_players(layer, N: int, rnd_strategy: str, io_type: str, cover_factor: float, aggressive: bool, backend: str): - model = create_player_model(layer=layer, rnd_strategy=rnd_strategy, io_type=io_type) - data = get_data((N, 15), 7, 1) - - path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' - - if 'Signature' in layer: - q = gfixed(1, 6, 3) - data = q(data).numpy() - if "padding='same'" in layer and io_type == 'io_stream': - pytest.skip("io_stream does not support padding='same' for pools at the moment") - - run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive) diff --git a/test/pytest/test_hgq_players.py b/test/pytest/test_hgq_players.py new file mode 100644 index 0000000000..9c4b40f97f --- /dev/null +++ b/test/pytest/test_hgq_players.py @@ -0,0 +1,171 @@ +from pathlib import Path + +import HGQ # noqa: F401 +import numpy as np +import pytest +import tensorflow as tf +from HGQ import get_default_paq_conf, set_default_paq_conf, trace_minmax +from HGQ.layers import ( # noqa: F401 + HConv1D, + HDense, + HQuantize, + PAvgPool1D, + PAvgPool2D, + PConcatenate, + PFlatten, + PMaxPool1D, + PMaxPool2D, + PReshape, + Signature, +) +from HGQ.proxy import to_proxy_model +from HGQ.proxy.fixed_point_quantizer import gfixed +from tensorflow import keras + +from hls4ml.converters import convert_from_keras_model + +# tf.config.experimental_run_functions_eagerly(True) # noqa + + +test_path = Path(__file__).parent + + +def _run_synth_match_test(proxy: keras.Model, data, io_type: str, backend: str, dir: str, cond=None): + + output_dir = dir + '/hls4ml_prj' + hls_model = convert_from_keras_model( + proxy, + io_type=io_type, + output_dir=output_dir, + backend=backend, + hls_config={'Model': {'Precision': 'fixed<1,0>', 'ReuseFactor': 1}}, + ) + hls_model.compile() + + data_len = data.shape[0] if isinstance(data, np.ndarray) else data[0].shape[0] + # Multiple output case. Check each output separately + if len(proxy.outputs) > 1: # type: ignore + r_proxy: list[np.ndarray] = [x.numpy() for x in proxy(data)] # type: ignore + r_hls: list[np.ndarray] = hls_model.predict(data) # type: ignore + r_hls = [x.reshape(r_proxy[i].shape) for i, x in enumerate(r_hls)] + else: + r_proxy: list[np.ndarray] = [proxy(data).numpy()] # type: ignore + r_hls: list[np.ndarray] = [hls_model.predict(data).reshape(r_proxy[0].shape)] # type: ignore + + errors = [] + for i, (p, h) in enumerate(zip(r_proxy, r_hls)): + try: + if cond is None: + mismatch_ph = p != h + assert ( + np.sum(mismatch_ph) == 0 + ), f"Proxy-HLS4ML mismatch for out {i}: {np.sum(np.any(mismatch_ph, axis=1))} out of {data_len} samples are different. 
Sample: {p[mismatch_ph].ravel()[:5]} vs {h[mismatch_ph].ravel()[:5]}" # noqa: E501 + else: + cond(p, h) + except AssertionError as e: + errors.append(e) + if len(errors) > 0: + msgs = [str(e) for e in errors] + raise AssertionError('\n'.join(msgs)) + + +def run_model_test( + model: keras.Model, cover_factor: float | None, data, io_type: str, backend: str, dir: str, aggressive: bool, cond=None +): + data_len = data.shape[0] if isinstance(data, np.ndarray) else data[0].shape[0] + if cover_factor is not None: + trace_minmax(model, data, cover_factor=cover_factor, bsz=data_len) + proxy = to_proxy_model(model, aggressive=aggressive, unary_lut_max_table_size=4096) + _run_synth_match_test(proxy, data, io_type, backend, dir, cond=cond) + + +def create_player_model(layer: str, rnd_strategy: str, io_type: str): + pa_config = get_default_paq_conf() + pa_config['rnd_strategy'] = rnd_strategy + pa_config['skip_dims'] = 'all' if io_type == 'io_stream' else 'batch' + set_default_paq_conf(pa_config) + + inp = keras.Input(shape=(15)) + if 'PConcatenate' in layer: + _inp = [HQuantize()(inp)] * 2 + out = eval(layer)(_inp) + out = HDense(15)(out) + return keras.Model(inp, out) + elif 'Signature' in layer: + _inp = eval(layer)(inp) + out = HDense(15)(_inp) + return keras.Model(inp, out) + elif 'Pool2D' in layer: + _inp = PReshape((3, 5, 1))(HQuantize()(inp)) + elif 'Pool1D' in layer: + _inp = PReshape((5, 3))(HQuantize()(inp)) + elif 'Dense' in layer or 'Activation' in layer: + _inp = HQuantize()(inp) + elif 'Flatten' in layer: + out = HQuantize()(inp) + out = PReshape((3, 5))(out) + out = HConv1D(2, 2)(out) + out = eval(layer)(out) + out = HDense(15)(out) + return keras.Model(inp, out) + else: + raise Exception(f'Please add test for {layer}') + + out = eval(layer)(_inp) + model = keras.Model(inp, out) + + for layer in model.layers: + # No weight bitwidths to randomize + # And activation bitwidths + if hasattr(layer, 'paq'): + fbw: tf.Variable = layer.paq.fbw + fbw.assign(tf.constant(np.random.uniform(4, 6, fbw.shape).astype(np.float32))) + + return model + + +def get_data(shape: tuple[int, ...], v: float, max_scale: float): + rng = np.random.default_rng() + a1 = rng.uniform(-v, v, shape).astype(np.float32) + a2 = rng.uniform(0, max_scale, (1, shape[1])).astype(np.float32) + return (a1 * a2).astype(np.float32) + + +@pytest.mark.parametrize( + 'layer', + [ + "PConcatenate()", + "PMaxPool1D(2, padding='same')", + "PMaxPool1D(4, padding='same')", + "PMaxPool2D((5,3), padding='same')", + "PMaxPool1D(2, padding='valid')", + "PMaxPool2D((2,3), padding='valid')", + "Signature(1,6,3)", + "PAvgPool1D(2, padding='same')", + "PAvgPool2D((1,2), padding='same')", + "PAvgPool2D((2,2), padding='same')", + "PAvgPool1D(2, padding='valid')", + "PAvgPool2D((1,2), padding='valid')", + "PAvgPool2D((2,2), padding='valid')", + "PFlatten()", + ], +) +@pytest.mark.parametrize("N", [1000]) +@pytest.mark.parametrize("rnd_strategy", ['floor', 'standard_round']) +@pytest.mark.parametrize("io_type", ['io_parallel', 'io_stream']) +@pytest.mark.parametrize("cover_factor", [1.0]) +@pytest.mark.parametrize("aggressive", [True, False]) +@pytest.mark.parametrize("backend", ['vivado', 'vitis']) +def test_syn_players(layer, N: int, rnd_strategy: str, io_type: str, cover_factor: float, aggressive: bool, backend: str): + model = create_player_model(layer=layer, rnd_strategy=rnd_strategy, io_type=io_type) + data = get_data((N, 15), 7, 1) + + path = test_path / f'hls4mlprj_hgq_{layer}_{rnd_strategy}_{io_type}_{aggressive}_{backend}' + + if 'Signature' 
in layer: + q = gfixed(1, 6, 3) + data = q(data).numpy() + if "padding='same'" in layer and io_type == 'io_stream': + pytest.skip("io_stream does not support padding='same' for pools at the moment") + + run_model_test(model, cover_factor, data, io_type, backend, str(path), aggressive) diff --git a/test/pytest/test_keras_api.py b/test/pytest/test_keras_api.py index af2507e8f7..4bb9f03751 100644 --- a/test/pytest/test_keras_api.py +++ b/test/pytest/test_keras_api.py @@ -119,9 +119,19 @@ def test_activations(activation_function, backend, io_type): @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI']) +@pytest.mark.parametrize( + 'backend,strategy', + [ + ('Vivado', 'Resource'), + ('Vivado', 'Latency'), + ('Vitis', 'Resource'), + ('Vitis', 'Latency'), + ('Quartus', 'Resource'), + ('oneAPI', 'Resource'), + ], +) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -def test_conv1d(padds, backend, io_type): +def test_conv1d(padds, backend, strategy, io_type): model = tf.keras.models.Sequential() input_shape = (10, 128, 4) model.add( @@ -144,7 +154,8 @@ def test_conv1d(padds, backend, io_type): keras_prediction = model.predict(X_input) config = hls4ml.utils.config_from_keras_model(model) - output_dir = str(test_root_path / f'hls4mlprj_keras_api_conv1d_{padds}_{backend}_{io_type}') + config['Model']['Strategy'] = strategy + output_dir = str(test_root_path / f'hls4mlprj_keras_api_conv1d_{padds}_{backend}_{strategy}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type ) @@ -192,9 +203,19 @@ def test_conv1d(padds, backend, io_type): @pytest.mark.parametrize('chans', chans_options) @pytest.mark.parametrize('padds', padds_options) -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus', 'oneAPI']) +@pytest.mark.parametrize( + 'backend,strategy', + [ + ('Vivado', 'Resource'), + ('Vivado', 'Latency'), + ('Vitis', 'Resource'), + ('Vitis', 'Latency'), + ('Quartus', 'Resource'), + ('oneAPI', 'Resource'), + ], +) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -def test_conv2d(chans, padds, backend, io_type): +def test_conv2d(chans, padds, backend, strategy, io_type): model = tf.keras.models.Sequential() input_shape = (28, 28, 3) model.add( @@ -215,7 +236,8 @@ def test_conv2d(chans, padds, backend, io_type): keras_prediction = model.predict(X_input) config = hls4ml.utils.config_from_keras_model(model) - output_dir = str(test_root_path / f'hls4mlprj_keras_api_conv2d_{backend}_{chans}_{padds}_{io_type}') + config['Model']['Strategy'] = strategy + output_dir = str(test_root_path / f'hls4mlprj_keras_api_conv2d_{backend}_{strategy}_{chans}_{padds}_{io_type}') hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, backend=backend, io_type=io_type ) diff --git a/test/pytest/test_multiout_network.py b/test/pytest/test_multiout_network.py index 15e23ff79a..366fac7fb5 100644 --- a/test/pytest/test_multiout_network.py +++ b/test/pytest/test_multiout_network.py @@ -19,6 +19,21 @@ def model(): return model +@pytest.fixture(scope='module') +def model_corner_cases(): + in1 = keras.layers.Input(shape=(24, 8)) + in2 = keras.layers.Input(shape=(16)) + out1 = keras.layers.Conv1D(1, 3)(in1) + out1 = keras.layers.Flatten()(out1) + out2 = keras.layers.Dense(16, activation='relu')(out1) + out2 = keras.layers.Add()([out2, in2]) + out3 = keras.layers.Dense(2)(out1) + out4 = 
keras.layers.Dense(2)(out2) + out4 = keras.layers.Flatten()(out4) + model = keras.models.Model(inputs=[in1, in2], outputs=[out1, out2, out3, out4]) + return model + + @pytest.fixture(scope='module') def data(): X = np.random.normal(0, 1, (1000, 10)) @@ -26,18 +41,20 @@ def data(): return X +@pytest.fixture(scope='module') +def data_corner_cases(): + X1 = np.random.normal(0, 1, (1000, 24, 8)) + X2 = np.random.normal(0, 1, (1000, 16)) + X1 = np.clip(X1, -16, 15) + X2 = np.clip(X2, -16, 15) + return X1, X2 + + @pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Vitis']) @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) -def test_multi_clone(model, data, backend: str, io_type: str): +def test_multi_output_nn(model, data, backend: str, io_type: str): output_dir = str(test_root_path / f'hls4mlprj_multiout_network_{backend}_{io_type}') hls_config = {'Model': {'Precision': 'fixed<32,5>', 'ReuseFactor': 1}} - layer_config = { - 'dense1': {'Precision': {'result': 'fixed<35,5>'}}, - 'dense2': {'Precision': {'result': 'fixed<40,5>'}}, - 'dense1_linear': {'Precision': {'result': 'fixed<35,5>'}}, - 'dense2_linear': {'Precision': {'result': 'fixed<40,5>'}}, - } - hls_config['LayerName'] = layer_config model_hls = convert_from_keras_model( model, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type ) @@ -50,3 +67,32 @@ def test_multi_clone(model, data, backend: str, io_type: str): assert np.allclose(r_hls[0], r_keras[0], atol=1e-5, rtol=0) assert np.allclose(r_hls[1], r_keras[1], atol=1e-5, rtol=0) + + +@pytest.mark.parametrize('backend', ['Vivado', 'Quartus', 'Vitis', 'Catapult', 'OneAPI']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +@pytest.mark.parametrize('strategy', ['latency', 'resource']) +def test_multi_output_nn_corner_cases(model_corner_cases, data_corner_cases, backend: str, io_type: str, strategy: str): + """Cover corner cases, when: + - a layer outputs both to the next layer(s) and to the model output + - when a node removal/insertion is triggered internally + - a reshape in io_parallel, or flatten in io_stream layer's output is used multiple times + - and as layer output + - and by a layer taking multiple inputs + - a Flatten layer outputs to the model output in io_stream + """ + output_dir = str(test_root_path / f'hls4mlprj_multiout_network_2_{backend}_{io_type}_{strategy}') + hls_config = {'Model': {'Precision': 'fixed<32,5>', 'ReuseFactor': 1}, 'Strategy': strategy} + + model_hls = convert_from_keras_model( + model_corner_cases, backend=backend, output_dir=output_dir, hls_config=hls_config, io_type=io_type + ) + + model_hls.compile() + r_hls = model_hls.predict(data_corner_cases) + r_keras = model_corner_cases.predict(data_corner_cases, verbose=0, batch_size=1000) + + assert np.allclose(r_hls[0], r_keras[0], atol=1e-5, rtol=0) + assert np.allclose(r_hls[1], r_keras[1], atol=1e-5, rtol=0) + assert np.allclose(r_hls[2], r_keras[2], atol=1e-5, rtol=0) + assert np.allclose(r_hls[3], r_keras[3], atol=1e-5, rtol=0) diff --git a/test/pytest/test_optimization/test_attributes.py b/test/pytest/test_optimization/test_attributes.py index 3ba8d08d14..a42d3a6751 100644 --- a/test/pytest/test_optimization/test_attributes.py +++ b/test/pytest/test_optimization/test_attributes.py @@ -1,7 +1,7 @@ from tensorflow.keras.layers import Conv2D, Dense, Flatten, ReLU from tensorflow.keras.models import Sequential -from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config +from hls4ml.optimization import 
get_attributes_from_keras_model_and_hls4ml_config from hls4ml.utils.config import config_from_keras_model diff --git a/test/pytest/test_optimization/test_keras/test_masking.py b/test/pytest/test_optimization/test_keras/test_masking.py index 5c5e60aca7..8b465d8d7e 100644 --- a/test/pytest/test_optimization/test_keras/test_masking.py +++ b/test/pytest/test_optimization/test_keras/test_masking.py @@ -4,10 +4,10 @@ from tensorflow.keras.layers import Conv2D, Dense, Flatten from tensorflow.keras.models import Sequential -from hls4ml.optimization.attributes import get_attributes_from_keras_model -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.masking import get_model_masks -from hls4ml.optimization.objectives import ParameterEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.masking import get_model_masks +from hls4ml.optimization.dsp_aware_pruning.objectives import ParameterEstimator ''' In all the tests, an artifical network with one Dense/Conv2D layer and pre-determined weights is created diff --git a/test/pytest/test_optimization/test_keras/test_reduction.py b/test/pytest/test_optimization/test_keras/test_reduction.py index 7243a9123f..4bf93f7301 100644 --- a/test/pytest/test_optimization/test_keras/test_reduction.py +++ b/test/pytest/test_optimization/test_keras/test_reduction.py @@ -6,8 +6,8 @@ from tensorflow.keras.layers import AveragePooling2D, BatchNormalization, Conv2D, Dense, Flatten, MaxPooling2D, ReLU, Softmax from tensorflow.keras.models import Sequential -from hls4ml.optimization.keras.reduction import reduce_model -from hls4ml.optimization.keras.utils import get_model_sparsity +from hls4ml.optimization.dsp_aware_pruning.keras.reduction import reduce_model +from hls4ml.optimization.dsp_aware_pruning.keras.utils import get_model_sparsity pytest.skip(allow_module_level=True) diff --git a/test/pytest/test_optimization/test_keras/test_regularizers.py b/test/pytest/test_optimization/test_keras/test_regularizers.py index 9fe518caae..f643f3a79a 100644 --- a/test/pytest/test_optimization/test_keras/test_regularizers.py +++ b/test/pytest/test_optimization/test_keras/test_regularizers.py @@ -6,9 +6,9 @@ from tensorflow.keras.models import Sequential from tensorflow.keras.optimizers import Adam -from hls4ml.optimization.config import SUPPORTED_STRUCTURES -from hls4ml.optimization.keras.builder import remove_custom_regularizers -from hls4ml.optimization.keras.regularizers import Conv2DRegularizer, DenseRegularizer +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.builder import remove_custom_regularizers +from hls4ml.optimization.dsp_aware_pruning.keras.regularizers import Conv2DRegularizer, DenseRegularizer # Constants pattern_offset = 4 diff --git a/test/pytest/test_optimization/test_keras/test_weight_sharing.py b/test/pytest/test_optimization/test_keras/test_weight_sharing.py index c274a84da8..be1d3a957f 100644 --- a/test/pytest/test_optimization/test_keras/test_weight_sharing.py +++ b/test/pytest/test_optimization/test_keras/test_weight_sharing.py @@ -4,10 +4,10 @@ from tensorflow.keras.layers import Dense from tensorflow.keras.models import Sequential -from hls4ml.optimization.attributes import get_attributes_from_keras_model -from hls4ml.optimization.config import SUPPORTED_STRUCTURES 
-from hls4ml.optimization.keras.masking import get_model_masks -from hls4ml.optimization.objectives import ObjectiveEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model +from hls4ml.optimization.dsp_aware_pruning.config import SUPPORTED_STRUCTURES +from hls4ml.optimization.dsp_aware_pruning.keras.masking import get_model_masks +from hls4ml.optimization.dsp_aware_pruning.objectives import ObjectiveEstimator # Similar tests in test_masking.py, weight sharing instead of pruning sparsity = 0.33 diff --git a/test/pytest/test_optimization/test_knapsack.py b/test/pytest/test_optimization/test_knapsack.py index a4145c00d0..804081c8e8 100644 --- a/test/pytest/test_optimization/test_knapsack.py +++ b/test/pytest/test_optimization/test_knapsack.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from hls4ml.optimization.knapsack import solve_knapsack +from hls4ml.optimization.dsp_aware_pruning.knapsack import solve_knapsack # In the simple case below, both implementations give the optimal answer diff --git a/test/pytest/test_optimization/test_objectives.py b/test/pytest/test_optimization/test_objectives.py index a7d81befe6..2f8a6414da 100644 --- a/test/pytest/test_optimization/test_objectives.py +++ b/test/pytest/test_optimization/test_objectives.py @@ -2,8 +2,8 @@ from tensorflow.keras.layers import Conv2D, Dense, Flatten from tensorflow.keras.models import Sequential -from hls4ml.optimization.attributes import get_attributes_from_keras_model -from hls4ml.optimization.objectives import ParameterEstimator +from hls4ml.optimization.dsp_aware_pruning.attributes import get_attributes_from_keras_model +from hls4ml.optimization.dsp_aware_pruning.objectives import ParameterEstimator # Test attempts to verify one of the estimators (parameter) is correctly declared, the functions are static etc. diff --git a/test/pytest/test_optimization/test_scheduler.py b/test/pytest/test_optimization/test_scheduler.py index 2dc7642bf6..2182d1cb46 100644 --- a/test/pytest/test_optimization/test_scheduler.py +++ b/test/pytest/test_optimization/test_scheduler.py @@ -1,6 +1,6 @@ import numpy as np # Use np.testing.assert_allclose due to floating point rounding errors -from hls4ml.optimization.scheduler import BinaryScheduler, ConstantScheduler, PolynomialScheduler +from hls4ml.optimization.dsp_aware_pruning.scheduler import BinaryScheduler, ConstantScheduler, PolynomialScheduler def test_constant_scheduler(): diff --git a/test/pytest/test_pipeline_style.py b/test/pytest/test_pipeline_style.py new file mode 100755 index 0000000000..17d180d487 --- /dev/null +++ b/test/pytest/test_pipeline_style.py @@ -0,0 +1,99 @@ +""" Test that pipeline style is properly handled by optimizers (respected if user-defined, correctly set if 'auto'). 
""" + +from pathlib import Path + +import pytest +import tensorflow as tf + +import hls4ml + +test_root_path = Path(__file__).parent + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize( + 'param_group, pipeline_style, io_type, strategy, ii', + [ + (1, 'auto', 'io_stream', 'resource', None), # io_stream should result in DATAFLOW pragma regardless of other params + (2, 'auto', 'io_stream', 'latency', None), + (3, None, 'io_stream', 'resource_unrolled', None), # None should be interpreted as 'auto' + (4, 'auto', 'io_parallel', 'resource', None), # Should end up with DATAFLOW pragma + (5, 'auto', 'io_parallel', 'latency', None), # Should end up with PIPELINE pragma + (6, 'auto', 'io_parallel', 'resource_unrolled', None), # Should end up with PIPELINE pragma and II + (7, 'pipeline', 'io_stream', 'resource', None), # Should result in a warning + (8, 'pipeline', 'io_parallel', 'resource', None), # Should result in a warning + (9, 'pipeline', 'io_parallel', 'latency', None), # No warning + (10, 'pipeline', 'io_parallel', 'latency', 10), # No warning, should include II=10 + (11, 'dataflow', 'io_stream', 'latency', None), # No warning + (12, 'dataflow', 'io_parallel', 'latency', None), # No warning + (13, 'dataflow', 'io_parallel', 'latency', None), # No warning + (14, 'wrong', 'io_parallel', 'latency', None), # Incorrect settings should issue a warning and switch to 'auto' + (15, 'auto', 'io_parallel', 'resource', None), # Special case to test Conv layer. No warning + (16, 'pipeline', 'io_parallel', 'resource', None), # Special case to test Conv layer. Should result in two warnings + ], +) +def test_pipeline_style(capfd, backend, param_group, pipeline_style, io_type, strategy, ii): + def _check_top_hls_pragma(model, pragma, ii=None): + assert model.config.pipeline_style == pragma + + pragma_to_check = f'#pragma HLS {pragma.upper()}' + if ii is not None: + pragma_to_check += f' II={ii}' + + with open(model.config.get_output_dir() + '/firmware/myproject.cpp') as main_file: + contents = main_file.readlines() + for line in contents: + if pragma_to_check in line: + return True + + return False + + if param_group in [15, 16]: + model = tf.keras.models.Sequential([tf.keras.layers.Conv1D(8, 2, input_shape=(10, 4))]) + else: + model = tf.keras.models.Sequential([tf.keras.layers.Dense(8, input_shape=(10,))]) + + config = hls4ml.utils.config_from_keras_model(model) + if pipeline_style is not None: + config['Model']['PipelineStyle'] = pipeline_style + if ii is not None: + config['Model']['PipelineInterval'] = ii + config['Model']['Strategy'] = strategy + config['Model']['ReuseFactor'] = 2 + + prj_name = f'hls4mlprj_pipeline_style_{backend}_{param_group}' + output_dir = str(test_root_path / prj_name) + hls_model = hls4ml.converters.convert_from_keras_model( + model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend + ) + hls_model.write() + + captured_warnings = [line for line in capfd.readouterr().out.split('\n') if line.startswith('WARNING')] + + if param_group in [1, 2, 3, 4]: + assert _check_top_hls_pragma(hls_model, 'dataflow') + elif param_group == 5: + assert _check_top_hls_pragma(hls_model, 'pipeline') + elif param_group == 6: + assert _check_top_hls_pragma(hls_model, 'pipeline', ii=2) + elif param_group in [7, 8]: + assert _check_top_hls_pragma(hls_model, 'pipeline') + assert any('bad QoR' in warning for warning in captured_warnings) + elif param_group == 9: + assert _check_top_hls_pragma(hls_model, 'pipeline') + assert len(captured_warnings) 
== 0 + elif param_group == 10: + assert _check_top_hls_pragma(hls_model, 'pipeline', ii=ii) + assert len(captured_warnings) == 0 + elif param_group in [11, 12, 13]: + assert _check_top_hls_pragma(hls_model, 'dataflow') + assert len(captured_warnings) == 0 + elif param_group == 14: + assert _check_top_hls_pragma(hls_model, 'pipeline') + assert any('Using "auto"' in warning for warning in captured_warnings) + elif param_group == 15: + assert _check_top_hls_pragma(hls_model, 'dataflow') + elif param_group == 16: + assert _check_top_hls_pragma(hls_model, 'pipeline') + assert any('bad QoR' in warning for warning in captured_warnings) + assert any('Convolution' in warning for warning in captured_warnings) diff --git a/test/pytest/test_pointwiseconv.py b/test/pytest/test_pointwiseconv.py index 678b22bfeb..1cfb43e4cd 100644 --- a/test/pytest/test_pointwiseconv.py +++ b/test/pytest/test_pointwiseconv.py @@ -19,25 +19,27 @@ @pytest.mark.parametrize('padds', padds_options) @pytest.mark.parametrize('strides', strides1d_options) @pytest.mark.parametrize( - 'backend, io_type, strategy', + 'backend, io_type, strategy, rf', [ - ('Quartus', 'io_parallel', 'resource'), - ('Quartus', 'io_stream', 'resource'), - ('oneAPI', 'io_parallel', 'resource'), - ('oneAPI', 'io_stream', 'resource'), - ('Vivado', 'io_parallel', 'resource'), - ('Vitis', 'io_parallel', 'resource'), - ('Vivado', 'io_parallel', 'latency'), - ('Vitis', 'io_parallel', 'latency'), - ('Vivado', 'io_stream', 'latency'), - ('Vivado', 'io_stream', 'resource'), - ('Vitis', 'io_stream', 'latency'), - ('Vitis', 'io_stream', 'resource'), - ('Catapult', 'io_stream', 'latency'), - ('Catapult', 'io_stream', 'resource'), + ('Quartus', 'io_parallel', 'resource', 1), + ('Quartus', 'io_stream', 'resource', 1), + ('oneAPI', 'io_parallel', 'resource', 1), + ('oneAPI', 'io_stream', 'resource', 1), + ('Vivado', 'io_parallel', 'resource', 1), + ('Vitis', 'io_parallel', 'resource', 1), + ('Vivado', 'io_parallel', 'latency', 1), + ('Vitis', 'io_parallel', 'latency', 1), + ('Vivado', 'io_parallel', 'latency', 14), + ('Vitis', 'io_parallel', 'latency', 14), + ('Vivado', 'io_stream', 'latency', 1), + ('Vivado', 'io_stream', 'resource', 1), + ('Vitis', 'io_stream', 'latency', 1), + ('Vitis', 'io_stream', 'resource', 1), + ('Catapult', 'io_stream', 'latency', 1), + ('Catapult', 'io_stream', 'resource', 1), ], ) -def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): +def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy, rf): model = tf.keras.models.Sequential() input_shape = (28, 3) model.add( @@ -50,6 +52,7 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise1d', ) ) model.compile(optimizer='adam', loss='mse') @@ -58,14 +61,12 @@ def test_pointwiseconv1d(chans, padds, strides, backend, io_type, strategy): keras_prediction = model.predict(X_input) default_precision = 'fixed<32,16>' - config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision) + config = hls4ml.utils.config_from_keras_model(model, default_precision=default_precision, granularity='name') config['Model']['Strategy'] = strategy + config['LayerName']['pointwise1d']['ReuseFactor'] = rf output_dir = str( - test_root_path - / 'hls4mlprj_pointwise1d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, strides[0], padds, backend, io_type, strategy - ) + test_root_path / 
f'hls4mlprj_pointwise1d_{chans}_{strides[0]}_{padds}_{backend}_{io_type}_{strategy}_rf{rf}' ) hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=config, output_dir=output_dir, io_type=io_type, backend=backend @@ -110,6 +111,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): kernel_initializer='normal', use_bias=False, data_format=chans, + name='pointwise2d', ) ) @@ -123,10 +125,7 @@ def test_pointwiseconv2d(chans, padds, strides, backend, io_type, strategy): config['Model']['Strategy'] = strategy stride_cfg = str(strides).replace(', ', '_').replace('(', '').replace(')', '') output_dir = str( - test_root_path - / 'hls4mlprj_pointwise2d_{}_strides_{}_{}_padding_{}_{}_{}'.format( - chans, stride_cfg, padds, backend, io_type, strategy - ) + test_root_path / f'hls4mlprj_pointwise2d_{chans}_strides_{stride_cfg}_{padds}_padding_{backend}_{io_type}_{strategy}' ) hls_model = hls4ml.converters.convert_from_keras_model( diff --git a/test/pytest/test_pytorch_api.py b/test/pytest/test_pytorch_api.py index b8cce4259f..3056bd13f8 100644 --- a/test/pytest/test_pytorch_api.py +++ b/test/pytest/test_pytorch_api.py @@ -63,6 +63,7 @@ def test_linear(backend, io_type): @pytest.mark.parametrize( "activation_function", [ + nn.Softmax(dim=-1), nn.ReLU(), nn.Tanh(), nn.LeakyReLU(negative_slope=1.0), @@ -119,6 +120,14 @@ def forward(self, x): return nn.functional.relu(x) +class SoftmaxModel(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return nn.functional.softmax(x, dim=-1) + + class TanHModel(nn.Module): def __init__(self): super().__init__() @@ -162,6 +171,7 @@ def forward(self, x): @pytest.mark.parametrize( "activation_function", [ + SoftmaxModel(), ReLuModel(), TanHModel(), LeakyReLuModel(), diff --git a/test/pytest/test_qkeras.py b/test/pytest/test_qkeras.py index 3d66107c85..a1ff93292e 100644 --- a/test/pytest/test_qkeras.py +++ b/test/pytest/test_qkeras.py @@ -356,8 +356,10 @@ def test_relu_negative_slope(randX_1000_1, quantizer, backend, io_type): ], ) def test_qactivation_kwarg(randX_100_10, activation_quantizer, weight_quantizer): - if activation_quantizer in ['binary', 'ternary']: + if activation_quantizer in ['binary']: name = 'bnbt_qdense_alpha' + elif activation_quantizer in ['ternary']: + name = 'bnbt_qdense_ternary_scale' else: name = f'qdense_{eval(activation_quantizer).__class__.__name__}' diff --git a/test/pytest/test_qonnx.py b/test/pytest/test_qonnx.py new file mode 100644 index 0000000000..f48f268626 --- /dev/null +++ b/test/pytest/test_qonnx.py @@ -0,0 +1,434 @@ +import os +import urllib +from pathlib import Path + +import numpy as np +import pytest +import qonnx.core.onnx_exec as oxe +import qonnx.util.cleanup +import qonnx.util.to_channels_last + +# To conveniently run QONNX inference +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.channels_last import ConvertToChannelsLastAndClean +from qonnx.transformation.gemm_to_matmul import GemmToMatMul + +import hls4ml + +test_root_path = Path(__file__).parent +example_model_path = (test_root_path / '../../example-models').resolve() + +# The models + + +@pytest.fixture(scope='module') +def tfc_2w2a_model(): + ''' + Load the tiny fully-connected model + ''' + dl_dir = test_root_path + dl_file = str(dl_dir / "qonnx-tfc-2w2a.onnx") + tfc_w2a2_qonnx_url = ( + "https://raw.githubusercontent.com/fastmachinelearning/" + "QONNX_model_zoo/main/models/MNIST/Brevitas_FINN_TFC/TFC/TFC_2W2A.onnx" + ) + 
urllib.request.urlretrieve(tfc_w2a2_qonnx_url, dl_file) + assert os.path.isfile(dl_file) + out_file = str(dl_dir / "qonnx-tfc-2w2a-clean.onnx") + + # cleanup + qonnx.util.cleanup.cleanup(dl_file, out_file=out_file) + model = ModelWrapper(out_file) + return model + + +@pytest.fixture(scope='module') +def cnv_2w2a_model(): + ''' + Load the small convolution model + ''' + dl_dir = test_root_path + dl_file = str(dl_dir / "qonnx-cnv-2w2a.onnx") + cnv_w2a2_qonnx_url = ( + "https://raw.githubusercontent.com/fastmachinelearning/" + "QONNX_model_zoo/main/models/CIFAR10/Brevitas_FINN_CNV/CNV_2W2A.onnx" + ) + urllib.request.urlretrieve(cnv_w2a2_qonnx_url, dl_file) + assert os.path.isfile(dl_file) + out_clean = str(dl_dir / "qonnx-cnv-2w2a-clean.onnx") + out_chanlast = str(dl_dir / "qonnx-cnv-2w2a-clean-channels-last.onnx") + out_file = str(dl_dir / "qonnx-cnv-2w2a-clean-channels-last-clean.onnx") + + # cleanup + qonnx.util.cleanup.cleanup(dl_file, out_file=out_clean) + qonnx.util.to_channels_last.to_channels_last(out_clean, make_input_channels_last=True, out_file=out_chanlast) + qonnx.util.cleanup.cleanup(out_chanlast, out_file=out_file) + model = ModelWrapper(out_file) + return model + + +@pytest.fixture(scope='module') +def jettagging_model(): + ''' + Load the 3 hidden layer QKeras example model trained on the jet tagging dataset + ''' + dl_dir = test_root_path + dl_file = str(dl_dir / "qkeras_jettagging.onnx") + jet_tagging_qonnx_url = ( + "https://raw.githubusercontent.com/fastmachinelearning/" + "QONNX_model_zoo/main/models/JetTagging/QKeras_hls4ml_3layer/qkeras_jettagging.onnx" + ) + urllib.request.urlretrieve(jet_tagging_qonnx_url, dl_file) + assert os.path.isfile(dl_file) + out_file = str(dl_dir / "qkeras_jettagging-clean.onnx") + + # cleanup + qonnx.util.cleanup.cleanup(dl_file, out_file=out_file) + model = ModelWrapper(out_file) + return model + + +@pytest.fixture(scope='module') +def sep_conv_model(): + """ + Load separable conv model, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/separable_conv_model_ch_last.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + + return model + + +@pytest.fixture(scope='module') +def branched_model(): + """ + Load branched model using separable convs, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/branched_model_ch_last.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + + return model + + +@pytest.fixture(scope='module') +def tiny_unet_model(): + """ + Load tiny unet model, already channels-last and cleaned + """ + dl_file = str(example_model_path / "onnx/tiny_unet_ch_last.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + + return model + + +@pytest.fixture(scope='module') +def two_layer_keras_model(): + """ + Load a simple, two-layer, originally keras, unquantized model + """ + dl_file = str(example_model_path / "onnx/two_layer_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def three_layer_keras_model(): + """ + Load a simple, three-layer, originally keras, unquantized model + """ + dl_file = str(example_model_path / "onnx/three_layer_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def two_layer_pytorch_model(): + """ + Load a simple, 
two-layer, originally pytorch, unquantized model + """ + dl_file = str(example_model_path / "onnx/two_layer_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def three_layer_pytorch_model(): + """ + Load a simple, three-layer, originally pytorch, unquantized model + """ + dl_file = str(example_model_path / "onnx/three_layer_pytorch.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def conv1d_small_keras_model(): + """ + Load a simple conv1d, originally keras, unquantized model + """ + dl_file = str(example_model_path / "onnx/conv1d_small_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(ConvertToChannelsLastAndClean()) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def conv2d_small_keras_model(): + """ + Load a simple conv2d, originally keras, unquantized model + """ + dl_file = str(example_model_path / "onnx/conv2d_small_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(ConvertToChannelsLastAndClean()) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +@pytest.fixture(scope='module') +def conv2d_small_mp_keras_model(): + """ + Load a conv2d model with max pooling, originally keras, unquantized model + """ + dl_file = str(example_model_path / "onnx/conv2d_small_mp_keras.onnx") + assert os.path.isfile(dl_file) + + model = ModelWrapper(dl_file) + model = qonnx.util.cleanup.cleanup_model(model) + model = model.transform(ConvertToChannelsLastAndClean()) + model = model.transform(GemmToMatMul()) + model = qonnx.util.cleanup.cleanup_model(model) + return model + + +# The actual tests + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_tfc_2w2a(tfc_2w2a_model, backend): + model = tfc_2w2a_model + + ishape = (1, 1, 28, 28) + X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + # Convert QONNX model, compile, and run inference + config = hls4ml.utils.config_from_onnx_model(model, backend=backend, default_precision='fixed<32,16>') + hls_model = hls4ml.converters.convert_from_onnx_model( + model, output_dir=str(test_root_path / f'hls4mlprj_qonnx_tfc-2w2a_{backend}'), backend=backend, hls_config=config + ) + hls_model.compile() + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +@pytest.mark.parametrize('backend', ['Vitis']) +def test_cnv_2w2a(cnv_2w2a_model, backend): + """ + This tests a convolution model. Note: the batch normalization weights are not quantized, so it is + difficult to make this match perfectly. It is also a slow test, which is why only Vitis is tested. 
+ """ + model = cnv_2w2a_model + + ishape = (1, 32, 32, 3) + X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**6) * 2**-6).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + # Convert QONNX model, compile, and run inference + config = hls4ml.utils.config_from_onnx_model(model, backend=backend, default_precision='fixed<32,6>') + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_qonnx_cnv-2w2a_{backend}'), + io_type='io_stream', + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis', 'Quartus']) +def test_jet_tagging(jettagging_model, backend): + model = jettagging_model + + # Execute QONNX model inference + # TODO make the test bigger + ishape = (1, 16) + X = np.random.uniform(low=-1, high=+1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + # Convert QONNX model, compile, and run inference + config = hls4ml.utils.config_from_onnx_model(model, backend=backend, default_precision='fixed<32,16>') + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, output_dir=str(test_root_path / f'hls4mlprj_qonnx_jettag_{backend}'), backend=backend, hls_config=config + ) + hls_model.compile() + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +@pytest.mark.parametrize('backend', ['Vitis']) +def test_sep_conv(sep_conv_model, backend): + model = sep_conv_model + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='fixed<32,16>' + ) + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_qonnx_sep_conv_{backend}'), + io_type='io_stream', + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) + + +@pytest.mark.parametrize('backend', ['Vitis']) +def test_branched_model(branched_model, backend): + model = branched_model + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='fixed<32,16>' + ) + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_qonnx_branched_model_{backend}'), + io_type='io_stream', + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = 
hls_model.predict(np.ascontiguousarray(X)) + + np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) + + +@pytest.mark.parametrize('backend', ['Vitis']) +def test_tiny_unet_model(tiny_unet_model, backend): + + model = tiny_unet_model + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**16) * 2**-16).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='fixed<32,16>' + ) + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_qonnx_tiny_unet_model_{backend}'), + io_type='io_stream', + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(np.ascontiguousarray(X)) + + np.testing.assert_array_equal(y_qonnx.ravel(), y_hls4ml.ravel()) + + +@pytest.mark.parametrize( + 'model_name', + [ + 'two_layer_keras_model', + 'three_layer_keras_model', + 'two_layer_pytorch_model', + 'three_layer_pytorch_model', + 'conv1d_small_keras_model', + 'conv2d_small_keras_model', + 'conv2d_small_mp_keras_model', + ], +) +@pytest.mark.parametrize('backend', ['Vitis']) +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +def test_simple_model(model_name, io_type, backend, request): + if model_name == 'conv2d_small_mp_keras_model' and io_type == 'io_stream': + # Not yet supported due to an issue with channels last conversion + # There is a qonnx PR. + pytest.skip() + model = request.getfixturevalue(model_name) + ishape = tuple(model.get_tensor_shape(model.graph.input[0].name)) + X = np.random.uniform(low=0, high=1, size=np.prod(ishape)).reshape(ishape) + X = (np.round(X * 2**10) * 2**-10).astype(np.float32) + idict = {model.graph.input[0].name: X} + y_qonnx = oxe.execute_onnx(model, idict)[model.graph.output[0].name] + + config = hls4ml.utils.config.config_from_onnx_model( + model, granularity='name', backend=backend, default_precision='fixed<16,6>' + ) + + for layer in config['LayerName']: + if layer.startswith('Softmax'): + config['LayerName'][layer]['Implementation'] = 'legacy' + + hls_model = hls4ml.converters.convert_from_onnx_model( + model, + output_dir=str(test_root_path / f'hls4mlprj_onnx_{model_name}_{io_type}_{backend}'), + io_type=io_type, + backend=backend, + hls_config=config, + ) + hls_model.compile() + y_hls4ml = hls_model.predict(X) + + np.testing.assert_allclose(y_qonnx.ravel(), y_hls4ml.ravel(), atol=1e-2, rtol=1) diff --git a/test/pytest/test_sepconv1d.py b/test/pytest/test_sepconv1d.py index 64312e9932..aef24db040 100644 --- a/test/pytest/test_sepconv1d.py +++ b/test/pytest/test_sepconv1d.py @@ -23,6 +23,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'), diff --git a/test/pytest/test_sepconv2d.py b/test/pytest/test_sepconv2d.py index 4732c7c7f1..1d056f15c9 100644 --- a/test/pytest/test_sepconv2d.py +++ b/test/pytest/test_sepconv2d.py @@ -23,6 +23,7 @@ @pytest.mark.parametrize( 'backend, io_type', [ + ('oneAPI', 'io_parallel'), ('Vivado', 'io_parallel'), ('Vitis', 'io_parallel'), ('Vivado', 'io_stream'),