From 85b9531a38eb7717a6878fe75eb10d95261fce55 Mon Sep 17 00:00:00 2001 From: katya <4249113+katyagovorkova@users.noreply.github.com> Date: Fri, 10 Feb 2023 21:01:47 +0100 Subject: [PATCH 01/12] WIP Add custom KL loss layer HLS implementation (#606) * add kl layer * separate hls part; clean up and add docs * creeate KL layer folder in contrib and move the files there * pass pre-commit check * README and fix pre-commit issue * update readme * fix formatting * add readme * Update README.md @jmitrevs readme updated! * Update README.md remove trailing whitespace * Update kl_layer.py * Rename nnet_distance.h to kl_layer.h * Update README.md * Update kl_layer.py * Update kl_layer.h * fix pre-commit * Fix KLLoss layer example --------- Co-authored-by: Jovan Mitrevski Co-authored-by: Vladimir Loncar --- contrib/kl_layer/README.md | 18 ++++ contrib/kl_layer/kl_layer.h | 87 ++++++++++++++++ contrib/kl_layer/kl_layer.py | 185 +++++++++++++++++++++++++++++++++++ 3 files changed, 290 insertions(+) create mode 100644 contrib/kl_layer/README.md create mode 100644 contrib/kl_layer/kl_layer.h create mode 100644 contrib/kl_layer/kl_layer.py diff --git a/contrib/kl_layer/README.md b/contrib/kl_layer/README.md new file mode 100644 index 0000000000..5d306ae69a --- /dev/null +++ b/contrib/kl_layer/README.md @@ -0,0 +1,18 @@ +This folder contains the implementation of custom KL divergence layer. +This is a custom implementation and not a built-in layer in any deep learning framework. +It was developed specifically for [AD@L1 CMS paper](https://www.nature.com/articles/s42256-022-00441-3). + +# Files + +* `kl_layer.py`: contains the standalone implementation of the custom KL divergence layer +* `kl_layer.h`: contains the HLS implementation of KL layer + + +# Usage + +`kl_layer.py` contains the example of how to use the KL layer. 
+To run do + +``` +python kl_layer.py +``` diff --git a/contrib/kl_layer/kl_layer.h b/contrib/kl_layer/kl_layer.h new file mode 100644 index 0000000000..0435b9a22e --- /dev/null +++ b/contrib/kl_layer/kl_layer.h @@ -0,0 +1,87 @@ +#ifndef KL_LAYER_H_ +#define KL_LAYER_H_ + +#include "nnet_activation.h" +#include "nnet_common.h" +#include +#include + +namespace nnet { + +struct distance_config { + // IO size + static const unsigned n_in = 10; + static const unsigned n_out = 1; + + // Internal data type definitions + typedef float accum_t; + typedef float sum_t; + typedef ap_fixed<18, 8> exp_table_t; + + // Internal info + static const unsigned table_size = 1024; + static constexpr unsigned exp_range = 8; +}; + +template void init_klloss_exp_table(typename CONFIG_T::exp_table_t table_out[N_TABLE]) { + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (range -1 to +1) + float in_val = 2 * CONFIG_T::exp_range * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::exp_table_t real_val = exp_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << " Index: " << ii << std::endl; + table_out[ii] = real_val; + } +} +template +void klloss(data1_T mean[CONFIG_T::n_in], data2_T log_var[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + #pragma HLS PIPELINE + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_klloss_exp_table(exp_table); + initialized = true; + } + typename CONFIG_T::accum_t kl[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=kl complete + typename CONFIG_T::accum_t mean_sq[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=mean_sq complete + typename CONFIG_T::accum_t kl_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS UNROLL + mean_sq[i] = mean[i] * mean[i]; + kl[i] = data2_T(1.) + log_var[i]; + // std::cout << "Log var: " << log_var[i] << " Result: " << kl[i] << std::endl; + } + constexpr unsigned table_scale = (unsigned)(CONFIG_T::table_size / (2 * CONFIG_T::exp_range)); + constexpr unsigned index_scale = (unsigned)(CONFIG_T::exp_range * table_scale); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS UNROLL + auto data_round = log_var[i] * table_scale; + auto index = data_round + index_scale; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + kl[i] -= exp_table[index]; + // std::cout << "Exp var: " << exp_table[index] << " Result: " << kl[i] << " Index: " << index << std::endl; + } + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS UNROLL + kl[i] -= mean_sq[i]; + } + Op_add op_add; + kl_sum = reduce>(kl, op_add); + // std::cout << "KL sum: " << kl_sum << std::endl; + kl_sum *= typename CONFIG_T::accum_t(1. 
/ CONFIG_T::n_in); + res[0] = res_T(-0.5) * kl_sum; +} +} // namespace nnet + +#endif diff --git a/contrib/kl_layer/kl_layer.py b/contrib/kl_layer/kl_layer.py new file mode 100644 index 0000000000..ec2af1b797 --- /dev/null +++ b/contrib/kl_layer/kl_layer.py @@ -0,0 +1,185 @@ +""" + Usage example for a custom KL loss layer + Takes as an input two arrays: z_mean and z_log_var + and computes KL "distance" between normal distribution + and Gaussian with mu=z_mean and sigma=z_log_var + + The HLS part is in contrib/kl_layer/kl_layer.h +""" +from pathlib import Path + +import numpy as np +import tensorflow as tf + +try: + from keras.layers.merge import _Merge as Merge +except Exception: + from keras.layers.merging.base_merge import _Merge as Merge + +from tensorflow.python.keras.utils import tf_utils +from tensorflow.python.ops import math_ops + +import hls4ml +from hls4ml.converters.keras_to_hls import parse_default_keras_layer +from hls4ml.model.attributes import ConfigurableAttribute, TypeAttribute +from hls4ml.model.types import FixedPrecisionType, RoundingMode, SaturationMode + + +# Keras implementation of a KL layer +class KLLoss(Merge): + '''Keras implementation of a KL loss custom layer''' + + @tf_utils.shape_type_conversion + def build(self, input_shape): + super().build(input_shape) + + def _merge_function(self, inputs): + + mean = inputs[0] + log_var = inputs[1] + + kl = 1.0 + log_var - math_ops.square(mean) - math_ops.exp(log_var) + kl = -0.5 * math_ops.reduce_mean(kl, axis=-1, keepdims=True) + + return kl + + +# hls4ml implementations +class HKLLoss(hls4ml.model.layers.Layer): + '''hls4ml implementation of a KL loss custom layer''' + + _expected_attributes = [ + ConfigurableAttribute('table_size', default=1024), + ConfigurableAttribute('exp_range', default=8), + TypeAttribute('accum'), + TypeAttribute( + 'sum', + default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), + ), + TypeAttribute( + 'exp_table', + default=FixedPrecisionType(18, 8, rounding_mode=RoundingMode.RND, saturation_mode=SaturationMode.SAT), + ), + ] + + def initialize(self): + self.add_output_variable(shape=[1], dim_names=[f'KL_LOSS_{self.index}']) + + +# Templates +distance_config_template = """struct config{index} : nnet::distance_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = 1; + typedef {accum_t.name} accum_t; + typedef {sum_t.name} sum_t; + typedef {exp_table_t.name} exp_table_t; + static const unsigned table_size = {table_size}; + static constexpr float exp_range = {exp_range}; +}};\n""" +distance_function_template = 'nnet::klloss<{input1_t}, {input2_t}, {output_t}, {config}>({input1}, {input2}, {output});' +distance_include_list = ['nnet_utils/kl_layer.h'] + + +class HKLLossConfigTemplate(hls4ml.backends.template.LayerConfigTemplate): + def __init__(self): + super().__init__(HKLLoss) + self.template = distance_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable(node.inputs[0]).shape[0] + params['n_out'] = 1 + return self.template.format(**params) + + +class HKLLossFunctionTemplate(hls4ml.backends.template.FunctionCallTemplate): + def __init__(self): + super().__init__(HKLLoss, include_header=distance_include_list) + self.template = distance_function_template + + def format(self, node): + params = {} + params['config'] = f'config{node.index}' + params['input1_t'] = node.get_input_variable(node.inputs[0]).type.name + params['input2_t'] = 
node.get_input_variable(node.inputs[1]).type.name + params['output_t'] = node.get_output_variable().type.name + params['input1'] = node.get_input_variable(node.inputs[0]).name + params['input2'] = node.get_input_variable(node.inputs[1]).name + params['output'] = node.get_output_variable().name + + return self.template.format(**params) + + +# Parser for converter +def parse_klloss_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'KLLoss' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + output_shape = [input_shapes[0][0], 1] + + return layer, output_shape + + +def main(): + # Register the converter for custom Keras layer + hls4ml.converters.register_keras_layer_handler('KLLoss', parse_klloss_layer) + + # Register the hls4ml's IR layer + hls4ml.model.layers.register_layer('KLLoss', HKLLoss) + + # Register the optimization passes (if any) + backend = hls4ml.backends.get_backend('Vivado') + + # Register template passes for the given backend + backend.register_template(HKLLossConfigTemplate) + backend.register_template(HKLLossFunctionTemplate) + + # Register HLS implementation + p = Path(__file__).parent / 'kl_layer.h' + backend.register_source(p) + + # Test if it works + # Create a dummy Keras model with KL loss layer + inp = tf.keras.layers.Input(shape=(19, 3, 1)) + z_mean = tf.keras.layers.Dense(10)(inp) + z_log_var = tf.keras.layers.Dense(10)(inp) + custom_output = KLLoss()([z_mean, z_log_var]) + # create new model + kmodel = tf.keras.models.Model(inputs=inp, outputs=custom_output) + kmodel.summary() + + # test on random inputs + x = np.random.randint(-5, 5, (1, 19, 3, 1), dtype='int32') + kres = kmodel(x) + + # Create dummy config + config = {} + config['Model'] = { + 'Precision': 'ap_fixed<16,6>', + 'ReuseFactor': 1, + 'ParallelizationFactor': 1, + 'Strategy': 'Resource', + } + hmodel = hls4ml.converters.convert_from_keras_model( + kmodel, + output_dir='hls4mlprj_kl_layer', + backend='Vivado', + io_type='io_parallel', + part='xcvu9p-flga2577-2-e', + hls_config=config, + ) + + hmodel.compile() + hres = hmodel.predict(x.astype('float32')) + + print('Compare prediction by hls4ml model to Keras one') + print(kres - hres) + + print('Building model') + report = hmodel.build(reset=True, csim=False, cosim=True, synth=True, vsynth=True) + print(report) + + +if __name__ == '__main__': + main() From 4d326d5824b0466a14bf8cb4b0758051e684c57c Mon Sep 17 00:00:00 2001 From: Vladimir Date: Fri, 10 Feb 2023 21:24:30 +0100 Subject: [PATCH 02/12] Fix incorrectly linted build command (#709) --- hls4ml/backends/vivado/vivado_backend.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index b1b586f6c4..793a1d24be 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -189,13 +189,13 @@ def build( curr_dir = os.getcwd() os.chdir(model.config.get_output_dir()) vivado_cmd = ( - f'vivado_hls -f build_prj.tcl "reset={reset}' - f'csim={csim}' - f'synth={synth}' - f'cosim={cosim}' - f'validation={validation}' - f'export={export}' - f'vsynth={vsynth}' + f'vivado_hls -f build_prj.tcl "reset={reset} ' + f'csim={csim} ' + f'synth={synth} ' + f'cosim={cosim} ' + f'validation={validation} ' + f'export={export} ' + f'vsynth={vsynth} ' f'fifo_opt={fifo_opt}"' ) os.system(vivado_cmd) From 5a586d0be6d33c4d27f300be188566f7ff0d747c Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 11 Feb 2023 
09:43:38 -0800 Subject: [PATCH 03/12] start updating docs --- docs/extension.rst | 5 ++ docs/flows.rst | 6 +++ docs/index.rst | 2 + docs/release_notes.rst | 115 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 128 insertions(+) create mode 100644 docs/extension.rst create mode 100644 docs/flows.rst diff --git a/docs/extension.rst b/docs/extension.rst new file mode 100644 index 0000000000..dc6e6e1609 --- /dev/null +++ b/docs/extension.rst @@ -0,0 +1,5 @@ +======================== +Extension API +======================== + +- Describe extension API by walking through ``test_extensions.py`` @jmduarte \ No newline at end of file diff --git a/docs/flows.rst b/docs/flows.rst new file mode 100644 index 0000000000..13ccc666cb --- /dev/null +++ b/docs/flows.rst @@ -0,0 +1,6 @@ +======== +Flows and Optimizers +======== + +- Explain concept of flows and optimizers +- Describe FIFO buffer optimizer as an example? \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index da7e445348..6299c5fbbf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,6 +7,8 @@ setup command concepts + flows + extension reference .. toctree:: diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 5db2480be9..3dedb2fa16 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -6,6 +6,121 @@ See `here `__ for offici ---- +**v0.7.0 / TBD** + +What's changed: + +* GarNet and GarNetStack in config.py by @yiiyama in https://github.com/fastmachinelearning/hls4ml/pull/344 +* support ZeroPadding layers by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/480 +* New backend development framework by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/395 +* Register ``ApplyAlpha`` layer templates by @thesps in https://github.com/fastmachinelearning/hls4ml/pull/499 +* Parsing extended by @nicologhielmetti in https://github.com/fastmachinelearning/hls4ml/pull/501 +* Remove intermediate casting in product by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/490 +* Add QKeras as a package dependency by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/511 +* Copy flows from config by @thesps in https://github.com/fastmachinelearning/hls4ml/pull/510 +* VivadoAccelerator backend updates by @thesps in https://github.com/fastmachinelearning/hls4ml/pull/508 +* Optimized look-up table by @nemerchiedde in https://github.com/fastmachinelearning/hls4ml/pull/527 +* Upsampling2D test case by @ChiRuiChen in https://github.com/fastmachinelearning/hls4ml/pull/520 +* Support UpSampling1D by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/475 +* RNN support (part 1) by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/521 +* Quartus Custom Matrix Multiplication & Quantization by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/523 +* Vivado-equivalent implementation of Softmax on Quartus by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/540 +* Ensure 2 bits for scale in po2 quantizers by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/531 +* Link update by @bkmgit in https://github.com/fastmachinelearning/hls4ml/pull/519 +* Fix removal of nodes ingested by multiple downstream nodes by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/544 +* Enable SeparableConv2d by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/547 +* Extension API by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/528 +* change string ReuseFactor to int by @jmitrevs in 
https://github.com/fastmachinelearning/hls4ml/pull/416 +* Make the size of bn scale and bias what they really are by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/532 +* Raise runtime error when a layer is named `input` by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/482 +* fix insertion before a node with multiple inputs + support additional broadcasting by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/551 +* Pointwise conv1d/2d resource by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/471 +* Quartus Embedding Layer by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/548 +* Fix for QActivations passed as an argument by @AdrianAlan in https://github.com/fastmachinelearning/hls4ml/pull/553 +* Don't override precision directly in the QKeras optimizer by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/567 +* Remove the in/out size from top function by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/559 +* Transpose2d, Concatenate2d, and up to 3 Clones for io_stream by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/402 +* Remove io_serial as io_stream and add some more info in docs. by @Duchstf in https://github.com/fastmachinelearning/hls4ml/pull/334 +* Update docs for v0.6.0 by @thesps in https://github.com/fastmachinelearning/hls4ml/pull/453 +* Use correct number of args for multiple outputs by @apfusco in https://github.com/fastmachinelearning/hls4ml/pull/487 +* Fixed a few typos in the documentation by @pitmonticone in https://github.com/fastmachinelearning/hls4ml/pull/467 +* returning integer from _compute_n_samples by @JochiSt in https://github.com/fastmachinelearning/hls4ml/pull/537 +* Providing support for Alveo boards by @selwyn96 in https://github.com/fastmachinelearning/hls4ml/pull/552 +* Make layer names case sensitive in config. 
by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/577 +* Add issue and PR templates by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/582 +* Vivado Backend GRU/LSTM support by @drankincms in https://github.com/fastmachinelearning/hls4ml/pull/560 +* Update CI template syntax by @thesps in https://github.com/fastmachinelearning/hls4ml/pull/593 +* Update flow dependencies by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/588 +* Fix parsing of ZeroPadding layers by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/595 +* remove cppname by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/562 +* Remove email helpline from the docs by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/601 +* Fixes for GRU/LSTM in Vivado backend by @drankincms in https://github.com/fastmachinelearning/hls4ml/pull/598 +* Remove io_serial by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/609 +* Fix test_graph by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/611 +* Override parent backend optimizer passes with derived backend passes by @thesps in https://github.com/fastmachinelearning/hls4ml/pull/597 +* Enforce function pipelining when using io_parallel with Resource strategy by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/605 +* FIFO depth optimization by @nicologhielmetti in https://github.com/fastmachinelearning/hls4ml/pull/509 +* Add tracing support for the quartus backend by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/583 +* Quartus streaming support for Activations, Dense & Batch Normalization by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/557 +* QConv alpha != 1 bug fix by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/612 +* Quartus Stream Embedding by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/625 +* change master to main by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/602 +* Edit order of the optimizers in the flow so that BramFactor is followed by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/621 +* Softmax LUT Optimization by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/570 +* Quartus Synthesis Flow Improvement by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/618 +* Quartus Extensions by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/628 +* Quartus GRU by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/596 +* Quartus Merge layers by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/634 +* fix nondefault project name handling by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/626 +* Fix parsing of logic synthesis reports by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/639 +* Fix conv1d stream implementation hls directives by @Jonathan-Shoemaker in https://github.com/fastmachinelearning/hls4ml/pull/635 +* Implementation and optimizations linked to Simple-RNN and LSTM for qu… by @nemerchiedde in https://github.com/fastmachinelearning/hls4ml/pull/575 +* Softsign optimization by @nemerchiedde in https://github.com/fastmachinelearning/hls4ml/pull/585 +* Parallel CNNs, Pooling & Image Layers for Quartus Backend by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/561 +* Quartus Streaming Softsign (PR #585 contd.) 
by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/655 +* Remove final reshapes even for Quartus by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/661 +* Unrolled CNN implementation by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/600 +* the strategy was not propagated in the pytest by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/663 +* Fix keras model loading issue with loading model with KerasH5 by @calad0i in https://github.com/fastmachinelearning/hls4ml/pull/664 +* append applied_flows container before filling instead of after by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/641 +* set version using ``setuptools_scm`` by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/479 +* Argmax Softmax by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/627 +* Fix version extraction in Sphinx config by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/669 +* Add requested citations to README by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/615 +* skip BatchNorm fusion when input/output is used multiple times by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/481 +* Use wider accum_t for (average) pooling by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/681 +* Quartus Streaming Conv, Pooling & Image layers by @bo3z in https://github.com/fastmachinelearning/hls4ml/pull/656 +* Create branch on PR by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/636 +* Delete ``example-prjs`` directory by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/682 +* Adiabatically turn on `pre-commit` by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/678 +* Add causal padding by @cgutsche in https://github.com/fastmachinelearning/hls4ml/pull/688 +* Update ``pre-commit`` GitHub Action by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/689 +* New config_from_keras_model by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/690 +* remove obsolete np.int and np.float by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/703 +* Update p-clang-format to work on mac by @jmduarte in https://github.com/fastmachinelearning/hls4ml/pull/704 +* Fix function call in Alveo tcl script by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/694 +* add readme for contrib by @jmitrevs in https://github.com/fastmachinelearning/hls4ml/pull/706 +* WIP Add custom KL loss layer HLS implementation by @katyagovorkova in https://github.com/fastmachinelearning/hls4ml/pull/606 +* Fix incorrectly linted build() command by @vloncar in https://github.com/fastmachinelearning/hls4ml/pull/709 + +New contributors: + +* @nemerchiedde made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/527 +* @ChiRuiChen made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/520 +* @bo3z made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/523 +* @bkmgit made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/519 +* @apfusco made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/487 +* @pitmonticone made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/467 +* @JochiSt made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/537 +* @selwyn96 made their first contribution in 
https://github.com/fastmachinelearning/hls4ml/pull/552 +* @Jonathan-Shoemaker made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/635 +* @calad0i made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/664 +* @cgutsche made their first contribution in https://github.com/fastmachinelearning/hls4ml/pull/688 + +**Full Changelog**: https://github.com/fastmachinelearning/hls4ml/compare/v0.6.0...v0.7.0 + +---- + **v0.6.0 / coris** What's changed: From 318ba8388efd2bebb8b42ea93e7e6238d59b3a01 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 11 Feb 2023 09:59:42 -0800 Subject: [PATCH 04/12] update --- docs/reference.rst | 5 +---- docs/status.rst | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/docs/reference.rst b/docs/reference.rst index 8216e0157e..8cd5222a8a 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -16,11 +16,8 @@ If you are using the package please cite: * J. Duarte *et al.*\ , "Fast inference of deep neural networks in FPGAs for particle physics", `JINST 13 P07027 (2018) `_\ , `arXiv:1804.06913 `_. -If you are using the boosted decision tree implementation, please cite also: +If you are using the binary or ternary neural network implementation, please also cite: - -* S. Summers *et al.*\ , "Fast inference of boosted decision trees in FPGAs for particle physics", `arXiv:2002.02534 `_. - If you are using the binary or ternary neural network implementation, please also cite: * G. Di Guglielmo *et al.*\ , "Compressing deep neural networks on FPGAs to binary and ternary precision with hls4ml", `arXiv:2003.06308 `_ Additional Talks and Presentations diff --git a/docs/status.rst b/docs/status.rst index e91b605add..fd44e82846 100644 --- a/docs/status.rst +++ b/docs/status.rst @@ -5,7 +5,7 @@ Status and Features Status ======== -The latest stable release is :doc:`v0.6.0 `. This release brings the new VivadoAccelerator backend to easily target boards like pynq-z2 and zcu102, with support for more boards like Alveo planned. +The latest stable release is :doc:`v0.7.0 `. Features @@ -15,13 +15,22 @@ A list of supported ML codes and architectures, including a summary table is bel ML code support: -* Keras/Tensorflow/QKeras, PyTorch, Onnx +* Keras/Tensorflow/QKeras +* PyTorch (limited) +* (Q)ONNX (in development) Neural network architectures: -* Fully Connected NNs (multi-layer perceptron) -* Convolutional NNs (1D/2D) -* Recurrent NN/LSTM, in prototyping +* Fully connected NNs (multilayer perceptron, MLP) +* Convolutional NNs (1D and 2D) +* Recurrent NN (LSTM) +* Graph NN (GarNet) + +HLS backends: + +* Vivado HLS +* Vitis HLS (experimental) +* Intel HLS A summary of the on-going status of the ``hls4ml`` tool is in the table below. @@ -31,15 +40,15 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. * - Architectures/Toolkits - Keras/TensorFlow/QKeras - PyTorch - - ONNX + - (Q)ONNX * - MLP - ``supported`` - ``supported`` - ``supported`` - * - Conv1D/Conv2D + * - CNN - ``supported`` - ``in development`` - - ``in development`` + - ``in development`` * - RNN/LSTM - ``in development`` - ``in development`` @@ -48,7 +57,7 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. Other feature notes: -* ``hls4ml`` is tested on Linux, and supports Vivado HLS versions 2018.2 to 2020.1. Vitis HLS is not yet supported. Windows and macOS are not supported. 
+* ``hls4ml`` is tested on Linux, and supports Vivado HLS versions 2018.2 to 2020.1 and Intel HLS versions XXX. Vitis HLS is experimentally supported in v0.7.0. Windows and macOS are not supported. * BDT support has moved to the `Conifer `__ package From 97ee56adadf8d1ac9e9247c969c3464445819f6f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 11 Feb 2023 12:35:58 -0800 Subject: [PATCH 05/12] Add --- .all-contributorsrc | 23 +++++++++++++++++++++++ README.md | 13 ++++++++++++- docs/extension.rst | 2 +- docs/flows.rst | 4 ++-- docs/index.rst | 10 +++++----- docs/reference.rst | 5 ++--- docs/release_notes.rst | 18 ++++++++---------- docs/status.rst | 3 +-- 8 files changed, 54 insertions(+), 24 deletions(-) create mode 100644 .all-contributorsrc diff --git a/.all-contributorsrc b/.all-contributorsrc new file mode 100644 index 0000000000..8d5eedede4 --- /dev/null +++ b/.all-contributorsrc @@ -0,0 +1,23 @@ +{ + "projectName": "hls4ml", + "projectOwner": "Fast ML Team", + "repoType": "github", + "repoHost": "https://github.com", + "files": ["README.md"], + "imageSize": 100, + "commit": false, + "contributorsPerLine": 7, + "contributorsSortAlphabetically": true, + "badgeTemplate": "[![All Contributors](https://img.shields.io/github/all-contributors/<%= projectOwner %>/<%= projectName %>?color=ee8449&style=flat-square)](#contributors)", + "contributorTemplate": "\">\" width=\"<%= options.imageSize %>px;\" alt=\"\"/>
<%= contributor.name %>
", + "types": { + "custom": { + "symbol": "🔭", + "description": "A custom contribution type.", + "link": "[<%= symbol %>](<%= url %> \"<%= description %>\")," + } + }, + "linkToUsage": true, + "skipCi": true, + "contributors": [] +} diff --git a/README.md b/README.md index e1bbdc1219..6a22a9792c 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![DOI](https://zenodo.org/badge/108329371.svg)](https://zenodo.org/badge/latestdoi/108329371) [![PyPI version](https://badge.fury.io/py/hls4ml.svg)](https://badge.fury.io/py/hls4ml) [![Supported Python versions](https://img.shields.io/pypi/pyversions/hls4ml.svg)](https://pypi.org/project/hls4ml/) +[![All Contributors](https://img.shields.io/github/all-contributors/projectOwner/projectName?color=ee8449&style=flat-square)](#contributors) A package for machine learning inference in FPGAs. We create firmware implementations of machine learning algorithms using high level synthesis language (HLS). We translate traditional open-source machine learning package models into HLS that can be configured for your use-case! @@ -21,7 +22,7 @@ Detailed tutorials on how to use `hls4ml`'s various functionalities can be found pip install hls4ml ``` -To install the extra dependencies for profiling: +To install the extra dependencies for profiling: ``` pip install hls4ml[profiling] @@ -130,3 +131,13 @@ binary/ternary networks: year = "2021" } ``` +## Contributors + + + + + + + + + diff --git a/docs/extension.rst b/docs/extension.rst index dc6e6e1609..71936bf187 100644 --- a/docs/extension.rst +++ b/docs/extension.rst @@ -2,4 +2,4 @@ Extension API ======================== -- Describe extension API by walking through ``test_extensions.py`` @jmduarte \ No newline at end of file +- Describe extension API by walking through ``test_extensions.py`` @jmduarte diff --git a/docs/flows.rst b/docs/flows.rst index 13ccc666cb..1336464355 100644 --- a/docs/flows.rst +++ b/docs/flows.rst @@ -2,5 +2,5 @@ Flows and Optimizers ======== -- Explain concept of flows and optimizers -- Describe FIFO buffer optimizer as an example? \ No newline at end of file +- Explain concept of flows and optimizers +- Describe FIFO buffer optimizer as an example? diff --git a/docs/index.rst b/docs/index.rst index 6299c5fbbf..d99ba38d98 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,10 +1,10 @@ .. toctree:: :hidden: - + Home release_notes status - setup + setup command concepts flows @@ -25,7 +25,7 @@ autodoc/hls4ml autodoc/hls4ml.* - + ================================== Welcome to hls4ml's documentation! @@ -42,7 +42,7 @@ The project is currently in development, so please let us know if you are intere Project Status ================================= -For the latest status including current and planned features, see the :doc:`Status and Features ` page. +For the latest status including current and planned features, see the :doc:`Status and Features ` page. Tutorials ================================= @@ -68,7 +68,7 @@ If you use this software in a publication, please cite the software and first publication: .. code-block:: bibtex - + @article{Duarte:2018ite, author = "Duarte, Javier and others", title = "{Fast inference of deep neural networks in FPGAs for particle physics}", diff --git a/docs/reference.rst b/docs/reference.rst index 8cd5222a8a..6cfea35f1e 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -9,7 +9,7 @@ Citation If you are using the package please cite: -* +* .. 
image:: https://zenodo.org/badge/108329371.svg :target: https://zenodo.org/badge/latestdoi/108329371 :alt: DOI @@ -31,7 +31,7 @@ Additional Talks and Presentations * CHEP 2018: `talk `__ * Connecting the Dots 2018: `talk `__ * Fermilab Research Techniques Seminar: `talk `__ -* CERN EP/IT Data Science Seminar: `talk `__ +* CERN EP/IT Data Science Seminar: `talk `__ Contributors ============ @@ -48,4 +48,3 @@ Contributors * Giuseppe Di Guglielmo [Columbia University] * Duc Hoang [Rhodes College] * Noah Paladino [Rutgers University] - diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 3dedb2fa16..b0cbe6c3d6 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -126,14 +126,14 @@ New contributors: What's changed: * ``VivadoAccelerator`` backend: target ``pynq-z2`` and ``zcu102`` boards directly from hls4ml by @nicologhielmetti -* Updated ``PyTorch`` and ``ONNX`` converters by @Duchstf -* ``line_buffer`` Conv2D implementation for ``io_stream``: reduced resource usage and latency by @Keb-L, @violatingcp, @vloncar -* Support ``QConv2DBatchnorm`` layer from ``QKeras`` by @nicologhielmetti -* Improved profiling plots - easier to compare original vs ``hls4ml`` converted models by @maksgraczyk -* Better derivation of data types for ``QKeras`` models by @jmduarte, @thesps +* Updated ``PyTorch`` and ``ONNX`` converters by @Duchstf +* ``line_buffer`` Conv2D implementation for ``io_stream``: reduced resource usage and latency by @Keb-L, @violatingcp, @vloncar +* Support ``QConv2DBatchnorm`` layer from ``QKeras`` by @nicologhielmetti +* Improved profiling plots - easier to compare original vs ``hls4ml`` converted models by @maksgraczyk +* Better derivation of data types for ``QKeras`` models by @jmduarte, @thesps * Improved CI by @thesps -* More support for models with branches, skip connections, ``Merge`` and ``Concatenate`` layers by @jmduarte, @vloncar -* Support for ``Dense`` layers over multi-dimensional tensors by @vloncar +* More support for models with branches, skip connections, ``Merge`` and ``Concatenate`` layers by @jmduarte, @vloncar +* Support for ``Dense`` layers over multi-dimensional tensors by @vloncar * Overall improvements by @vloncar, @jmduarte, @thesps, @jmitrevs & others New contributors: @@ -248,8 +248,6 @@ Bugfixes: **v0.0.2**\ : first alpha release -* full translation of DNNs from Keras +* full translation of DNNs from Keras * an example Conv1D exists * parallel mode is supported (serial mode, not yet) - - diff --git a/docs/status.rst b/docs/status.rst index fd44e82846..b76d08584d 100644 --- a/docs/status.rst +++ b/docs/status.rst @@ -13,7 +13,7 @@ Features A list of supported ML codes and architectures, including a summary table is below. Dependencies are given in the :doc:`Setup ` page. -ML code support: +ML code support: * Keras/Tensorflow/QKeras * PyTorch (limited) @@ -65,4 +65,3 @@ Example Models ============== We also provide and documented several example models that have been implemented in ``hls4ml`` in `this Github repository `_. 
- From 1f28a5bec61794b40b6d002a31c746187191cdd3 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 11 Feb 2023 12:45:39 -0800 Subject: [PATCH 06/12] test --- .all-contributorsrc | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.all-contributorsrc b/.all-contributorsrc index 8d5eedede4..f16bf6e866 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -1,6 +1,6 @@ { "projectName": "hls4ml", - "projectOwner": "Fast ML Team", + "projectOwner": "jmduarte", "repoType": "github", "repoHost": "https://github.com", "files": ["README.md"], diff --git a/README.md b/README.md index 6a22a9792c..3b955d754f 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![DOI](https://zenodo.org/badge/108329371.svg)](https://zenodo.org/badge/latestdoi/108329371) [![PyPI version](https://badge.fury.io/py/hls4ml.svg)](https://badge.fury.io/py/hls4ml) [![Supported Python versions](https://img.shields.io/pypi/pyversions/hls4ml.svg)](https://pypi.org/project/hls4ml/) -[![All Contributors](https://img.shields.io/github/all-contributors/projectOwner/projectName?color=ee8449&style=flat-square)](#contributors) +[![All Contributors](https://img.shields.io/github/all-contributors/jmduarte/hls4ml?color=ee8449&style=flat-square)](#contributors) A package for machine learning inference in FPGAs. We create firmware implementations of machine learning algorithms using high level synthesis language (HLS). We translate traditional open-source machine learning package models into HLS that can be configured for your use-case! From e60f1d7bb5203dbbe0784734f0f26fcff2258a84 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 11 Feb 2023 12:59:24 -0800 Subject: [PATCH 07/12] update --- .all-contributorsrc | 9 +--- README.md | 2 +- docs/index.rst | 82 ---------------------------------- docs/reference.rst | 105 ++++++++++++++++++++++++++++++-------------- 4 files changed, 75 insertions(+), 123 deletions(-) diff --git a/.all-contributorsrc b/.all-contributorsrc index f16bf6e866..5444d26189 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -1,6 +1,6 @@ { "projectName": "hls4ml", - "projectOwner": "jmduarte", + "projectOwner": "fastmachinelearning", "repoType": "github", "repoHost": "https://github.com", "files": ["README.md"], @@ -10,13 +10,6 @@ "contributorsSortAlphabetically": true, "badgeTemplate": "[![All Contributors](https://img.shields.io/github/all-contributors/<%= projectOwner %>/<%= projectName %>?color=ee8449&style=flat-square)](#contributors)", "contributorTemplate": "\">\" width=\"<%= options.imageSize %>px;\" alt=\"\"/>
<%= contributor.name %>
", - "types": { - "custom": { - "symbol": "🔭", - "description": "A custom contribution type.", - "link": "[<%= symbol %>](<%= url %> \"<%= description %>\")," - } - }, "linkToUsage": true, "skipCi": true, "contributors": [] diff --git a/README.md b/README.md index 3b955d754f..d5f24ca73e 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![DOI](https://zenodo.org/badge/108329371.svg)](https://zenodo.org/badge/latestdoi/108329371) [![PyPI version](https://badge.fury.io/py/hls4ml.svg)](https://badge.fury.io/py/hls4ml) [![Supported Python versions](https://img.shields.io/pypi/pyversions/hls4ml.svg)](https://pypi.org/project/hls4ml/) -[![All Contributors](https://img.shields.io/github/all-contributors/jmduarte/hls4ml?color=ee8449&style=flat-square)](#contributors) +[![All Contributors](https://img.shields.io/github/all-contributors/fastmachinelearning/hls4ml?color=ee8449&style=flat-square)](#contributors) A package for machine learning inference in FPGAs. We create firmware implementations of machine learning algorithms using high level synthesis language (HLS). We translate traditional open-source machine learning package models into HLS that can be configured for your use-case! diff --git a/docs/index.rst b/docs/index.rst index d99ba38d98..c766f18365 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -49,85 +49,3 @@ Tutorials Detailed tutorials on how to use ``hls4ml``'s various functionalities can be found at: https://github.com/fastmachinelearning/hls4ml-tutorial - -Citation -================================= -If you use this software in a publication, please cite the software - -.. code-block:: bibtex - - @software{vloncar_2021_5680908, - author = {{FastML Team}}, - title = {fastmachinelearning/hls4ml}, - year = 2021, - publisher = {Zenodo}, - doi = {10.5281/zenodo.1201549}, - url = {https://github.com/fastmachinelearning/hls4ml} - } - -and first publication: - -.. code-block:: bibtex - - @article{Duarte:2018ite, - author = "Duarte, Javier and others", - title = "{Fast inference of deep neural networks in FPGAs for particle physics}", - eprint = "1804.06913", - archivePrefix = "arXiv", - primaryClass = "physics.ins-det", - reportNumber = "FERMILAB-PUB-18-089-E", - doi = "10.1088/1748-0221/13/07/P07027", - journal = "JINST", - volume = "13", - number = "07", - pages = "P07027", - year = "2018" - } - -Additionally, if you use specific features developed in later papers, please cite those as well. For example, CNNs: - -.. code-block:: bibtex - - @article{Aarrestad:2021zos, - author = "Aarrestad, Thea and others", - title = "{Fast convolutional neural networks on FPGAs with hls4ml}", - eprint = "2101.05108", - archivePrefix = "arXiv", - primaryClass = "cs.LG", - reportNumber = "FERMILAB-PUB-21-130-SCD", - doi = "10.1088/2632-2153/ac0ea1", - journal = "Mach. Learn. Sci. Tech.", - volume = "2", - number = "4", - pages = "045015", - year = "2021" - } - @article{Ghielmetti:2022ndm, - author = "Ghielmetti, Nicol\`{o} and others", - title = "{Real-time semantic segmentation on FPGAs for autonomous vehicles with hls4ml}", - eprint = "2205.07690", - archivePrefix = "arXiv", - primaryClass = "cs.CV", - reportNumber = "FERMILAB-PUB-22-435-PPD", - doi = "10.1088/2632-2153/ac9cb5", - journal ="Mach. Learn. Sci. Tech.", - year = "2022" - } - -binary/ternary networks: - -.. 
code-block:: bibtex - - @article{Loncar:2020hqp, - author = "Ngadiuba, Jennifer and others", - title = "{Compressing deep neural networks on FPGAs to binary and ternary precision with HLS4ML}", - eprint = "2003.06308", - archivePrefix = "arXiv", - primaryClass = "cs.LG", - reportNumber = "FERMILAB-PUB-20-167-PPD-SCD", - doi = "10.1088/2632-2153/aba042", - journal = "Mach. Learn. Sci. Tech.", - volume = "2", - pages = "015001", - year = "2021" - } diff --git a/docs/reference.rst b/docs/reference.rst index 6cfea35f1e..dd983081e8 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -1,50 +1,91 @@ ============================ -Reference and Contributors +Citation and Contributors ============================ Citation -======== +================================= +If you use this software in a publication, please cite the software -If you are using the package please cite: +.. code-block:: bibtex + @software{vloncar_2021_5680908, + author = {{FastML Team}}, + title = {fastmachinelearning/hls4ml}, + year = 2021, + publisher = {Zenodo}, + doi = {10.5281/zenodo.1201549}, + url = {https://github.com/fastmachinelearning/hls4ml} + } -* - .. image:: https://zenodo.org/badge/108329371.svg - :target: https://zenodo.org/badge/latestdoi/108329371 - :alt: DOI +and first publication: -* J. Duarte *et al.*\ , "Fast inference of deep neural networks in FPGAs for particle physics", `JINST 13 P07027 (2018) `_\ , `arXiv:1804.06913 `_. +.. code-block:: bibtex -If you are using the binary or ternary neural network implementation, please also cite: + @article{Duarte:2018ite, + author = "Duarte, Javier and others", + title = "{Fast inference of deep neural networks in FPGAs for particle physics}", + eprint = "1804.06913", + archivePrefix = "arXiv", + primaryClass = "physics.ins-det", + reportNumber = "FERMILAB-PUB-18-089-E", + doi = "10.1088/1748-0221/13/07/P07027", + journal = "JINST", + volume = "13", + number = "07", + pages = "P07027", + year = "2018" + } -* G. Di Guglielmo *et al.*\ , "Compressing deep neural networks on FPGAs to binary and ternary precision with hls4ml", `arXiv:2003.06308 `_ +Additionally, if you use specific features developed in later papers, please cite those as well. For example, CNNs: -Additional Talks and Presentations -================================== +.. code-block:: bibtex + @article{Aarrestad:2021zos, + author = "Aarrestad, Thea and others", + title = "{Fast convolutional neural networks on FPGAs with hls4ml}", + eprint = "2101.05108", + archivePrefix = "arXiv", + primaryClass = "cs.LG", + reportNumber = "FERMILAB-PUB-21-130-SCD", + doi = "10.1088/2632-2153/ac0ea1", + journal = "Mach. Learn. Sci. Tech.", + volume = "2", + number = "4", + pages = "045015", + year = "2021" + } + @article{Ghielmetti:2022ndm, + author = "Ghielmetti, Nicol\`{o} and others", + title = "{Real-time semantic segmentation on FPGAs for autonomous vehicles with hls4ml}", + eprint = "2205.07690", + archivePrefix = "arXiv", + primaryClass = "cs.CV", + reportNumber = "FERMILAB-PUB-22-435-PPD", + doi = "10.1088/2632-2153/ac9cb5", + journal ="Mach. Learn. Sci. Tech.", + year = "2022" + } -* eScience 2019: `talk `__ -* ACAT 2019: `talk `__ -* Zurich Hands-on Course: `course `__ -* TWEPP 2018: `talk `__ -* CHEP 2018: `talk `__ -* Connecting the Dots 2018: `talk `__ -* Fermilab Research Techniques Seminar: `talk `__ -* CERN EP/IT Data Science Seminar: `talk `__ +binary/ternary networks: + +.. 
code-block:: bibtex + + @article{Loncar:2020hqp, + author = "Ngadiuba, Jennifer and others", + title = "{Compressing deep neural networks on FPGAs to binary and ternary precision with HLS4ML}", + eprint = "2003.06308", + archivePrefix = "arXiv", + primaryClass = "cs.LG", + reportNumber = "FERMILAB-PUB-20-167-PPD-SCD", + doi = "10.1088/2632-2153/aba042", + journal = "Mach. Learn. Sci. Tech.", + volume = "2", + pages = "015001", + year = "2021" + } Contributors ============ - -* Vladimir Loncar, Jennifer Ngadiuba, Maurizio Pierini, Sioni Summers [CERN] -* Javier Duarte [University of California San Diego] -* Sergo Jindariani, Benjamin Kreis, Ryan Rivera, Nhan Tran [Fermilab] -* Edward Kreinar [Hawkeye360] -* Song Han, Philip Harris, Dylan Rankin [MIT] -* Zhenbin Wu [University of Illinois at Chicago] -* Mark Neubauer [University of Illinois Urbana-Champaign] -* Shih-Chieh Hsu [University of Washington] -* Giuseppe Di Guglielmo [Columbia University] -* Duc Hoang [Rhodes College] -* Noah Paladino [Rutgers University] +Check the README for the full list of contributors! From a08f8219b43ecf165de687033213c0eeee4bffc5 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 11 Feb 2023 14:44:21 -0800 Subject: [PATCH 08/12] update --- .all-contributorsrc | 16 ---------------- README.md | 32 ++++++++++++-------------------- docs/conf.py | 10 ++++++---- docs/reference.rst | 7 ++++++- setup.cfg | 2 +- 5 files changed, 25 insertions(+), 42 deletions(-) delete mode 100644 .all-contributorsrc diff --git a/.all-contributorsrc b/.all-contributorsrc deleted file mode 100644 index 5444d26189..0000000000 --- a/.all-contributorsrc +++ /dev/null @@ -1,16 +0,0 @@ -{ - "projectName": "hls4ml", - "projectOwner": "fastmachinelearning", - "repoType": "github", - "repoHost": "https://github.com", - "files": ["README.md"], - "imageSize": 100, - "commit": false, - "contributorsPerLine": 7, - "contributorsSortAlphabetically": true, - "badgeTemplate": "[![All Contributors](https://img.shields.io/github/all-contributors/<%= projectOwner %>/<%= projectName %>?color=ee8449&style=flat-square)](#contributors)", - "contributorTemplate": "\">\" width=\"<%= options.imageSize %>px;\" alt=\"\"/>
<%= contributor.name %>
", - "linkToUsage": true, - "skipCi": true, - "contributors": [] -} diff --git a/README.md b/README.md index d5f24ca73e..b4c2a65501 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,8 @@ [![DOI](https://zenodo.org/badge/108329371.svg)](https://zenodo.org/badge/latestdoi/108329371) [![PyPI version](https://badge.fury.io/py/hls4ml.svg)](https://badge.fury.io/py/hls4ml) [![Supported Python versions](https://img.shields.io/pypi/pyversions/hls4ml.svg)](https://pypi.org/project/hls4ml/) -[![All Contributors](https://img.shields.io/github/all-contributors/fastmachinelearning/hls4ml?color=ee8449&style=flat-square)](#contributors) +[![Documentation Status](https://github.com/fastmachinelearning/hls4ml/actions/workflows/build-sphinx.yml/badge.svg)](https://fastmachinelearning.org/hls4ml) + A package for machine learning inference in FPGAs. We create firmware implementations of machine learning algorithms using high level synthesis language (HLS). We translate traditional open-source machine learning package models into HLS that can be configured for your use-case! @@ -18,13 +19,13 @@ For more information visit the webpage: [https://fastmachinelearning.org/hls4ml/ Detailed tutorials on how to use `hls4ml`'s various functionalities can be found [here](https://github.com/hls-fpga-machine-learning/hls4ml-tutorial). # Installation -``` +```bash pip install hls4ml ``` To install the extra dependencies for profiling: -``` +```bash pip install hls4ml[profiling] ``` @@ -33,13 +34,14 @@ pip install hls4ml[profiling] ```Python import hls4ml -#Fetch a keras model from our example repository -#This will download our example model to your working directory and return an example configuration file +# Fetch a keras model from our example repository +# This will download our example model to your working directory and return an example configuration file config = hls4ml.utils.fetch_example_model('KERAS_3layer.json') -print(config) #You can print the configuration to see some default parameters +# You can print the configuration to see some default parameters +print(config) -#Convert it to a hls project +# Convert it to a hls project hls_model = hls4ml.converters.keras_to_hls(config) # Print full list of example models if you want to explore more @@ -50,11 +52,11 @@ hls4ml.utils.fetch_example_list() Note: Vitis HLS is not yet supported. Vivado HLS versions between 2018.2 and 2020.1 are recommended. 
```Python -#Use Vivado HLS to synthesize the model -#This might take several minutes +# Use Vivado HLS to synthesize the model +# This might take several minutes hls_model.build() -#Print out the report if you want +# Print out the report if you want hls4ml.report.read_vivado_report('my-hls-test') ``` @@ -131,13 +133,3 @@ binary/ternary networks: year = "2021" } ``` -## Contributors - - - - - - - - - diff --git a/docs/conf.py b/docs/conf.py index 04df6dba96..ab7d6c33bf 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,15 +12,17 @@ # import os import sys + sys.path.insert(0, os.path.abspath('../')) import datetime + from setuptools_scm import get_version # -- Project information ----------------------------------------------------- project = 'hls4ml' -copyright = str(datetime.datetime.now().year)+', Fast Machine Learning Lab' +copyright = str(datetime.datetime.now().year) + ', Fast Machine Learning Lab' author = 'Fast Machine Learning Lab' # The full version, including alpha/beta/rc tags @@ -36,7 +38,8 @@ 'sphinx.ext.autodoc', 'sphinx.ext.githubpages', 'sphinx_rtd_theme', - 'sphinx.ext.napoleon' + 'sphinx.ext.napoleon', + 'sphinx_contributors', ] # Add any paths that contain templates here, relative to this directory. @@ -72,12 +75,11 @@ 'display_version': True, 'prev_next_buttons_location': 'bottom', 'style_external_links': False, - 'style_nav_header_background': '#2980B9', # Toc options 'collapse_navigation': True, 'sticky_navigation': True, 'navigation_depth': 2, 'includehidden': True, - 'titles_only': False + 'titles_only': False, } diff --git a/docs/reference.rst b/docs/reference.rst index dd983081e8..aa4443653a 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -88,4 +88,9 @@ binary/ternary networks: Contributors ============ -Check the README for the full list of contributors! +Thanks to our contributors! + +.. 
contributors:: fastmachinelearning/hls4ml + :avatars: + :limit: 100 + :order: DESC diff --git a/setup.cfg b/setup.cfg index bc6dbf643d..9ff049d343 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,7 +8,7 @@ author = hls4ml Team license = Apache-2.0 license_file = LICENSE classifiers = - Development Status :: 3 - Alpha + Development Status :: 4 - Beta Intended Audience :: Developers Intended Audience :: Science/Research License :: OSI Approved :: Apache Software License From 79750328ef2eb9d9580d346d460317ef94b4b827 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 11 Feb 2023 14:48:19 -0800 Subject: [PATCH 09/12] add sphinx_contributors to requirements --- docs/requirements.txt | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index e4295d37eb..8ff43b5e0b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,12 +1,13 @@ -sphinx>=3.2.1 -sphinx_rtd_theme -toposort>=1.5.0 -numpy -six -pyyaml h5py +matplotlib +numpy onnx>=1.4.0 pandas +pyyaml seaborn -matplotlib setuptools_scm[toml]>=5 +six +sphinx>=3.2.1 +sphinx_contributors +sphinx_rtd_theme +toposort>=1.5.0 From 4cc9b0a7f3cd120580b948a6f13d0446d397d2c9 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 11 Feb 2023 14:58:37 -0800 Subject: [PATCH 10/12] update --- README.md | 1 - docs/flows.rst | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b4c2a65501..ff815f7cbc 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,6 @@ [![Supported Python versions](https://img.shields.io/pypi/pyversions/hls4ml.svg)](https://pypi.org/project/hls4ml/) [![Documentation Status](https://github.com/fastmachinelearning/hls4ml/actions/workflows/build-sphinx.yml/badge.svg)](https://fastmachinelearning.org/hls4ml) - A package for machine learning inference in FPGAs. We create firmware implementations of machine learning algorithms using high level synthesis language (HLS). We translate traditional open-source machine learning package models into HLS that can be configured for your use-case! If you have any questions, comments, or ideas regarding hls4ml or just want to show us how you use hls4ml, don't hesitate to reach us through the [discussions](https://github.com/fastmachinelearning/hls4ml/discussions) tab. diff --git a/docs/flows.rst b/docs/flows.rst index 1336464355..28d423aa8e 100644 --- a/docs/flows.rst +++ b/docs/flows.rst @@ -1,6 +1,6 @@ -======== +==================== Flows and Optimizers -======== +==================== - Explain concept of flows and optimizers - Describe FIFO buffer optimizer as an example? From 1daa4aec6eef41d3f297581263cacc7efab64af6 Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sat, 11 Feb 2023 17:52:32 -0800 Subject: [PATCH 11/12] update --- docs/extension.rst | 180 ++++++++++++++++++++++++++++++++- test/pytest/test_extensions.py | 2 +- 2 files changed, 180 insertions(+), 2 deletions(-) diff --git a/docs/extension.rst b/docs/extension.rst index 71936bf187..8c534702a7 100644 --- a/docs/extension.rst +++ b/docs/extension.rst @@ -2,4 +2,182 @@ Extension API ======================== -- Describe extension API by walking through ``test_extensions.py`` @jmduarte +hls4ml natively supports a large number of neural network layers. +But what if a desired layer is not supported? +If it is standard enough and its implementation would benefit the community as a whole, we would welcome a contribution to add it to the standard set of supported layers. 
+However, if it is a somewhat niche custom layer, there is another approach we can take to extend hls4ml through the *extension API*. + +This documentation will walk through a complete `complete end-to-end example `_, which is part of our testing suite. +To implement a custom layer in hls4ml with the extension API, the required components are: + +* Your custom layer class +* Equivalent hls4ml custom layer class +* Parser for the converter +* HLS implementation +* Layer config template +* Function config template +* Registration of layer, source code, and templates + +For concreteness, let's say our custom layer, implemented in Keras, reverses the order of the last dimension of the inputs. + +.. code-block:: Python + + # Keras implementation of a custom layer + class KReverse(tf.keras.layers.Layer): + '''Keras implementation of a hypothetical custom layer''' + + def __init__(self): + super().__init__() + + def call(self, inputs): + return tf.reverse(inputs, axis=[-1]) + +Now, we can define the equivalent layer in hls4ml, which inherits from ``hls4ml.model.layers.Layer``. + +.. code-block:: Python + + # hls4ml layer implementation + class HReverse(hls4ml.model.layers.Layer): + '''hls4ml implementation of a hypothetical custom layer''' + + def initialize(self): + inp = self.get_input_variable() + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + +A parser for the converter is also required. +This parser reads the attributes of the Keras layer instance and populates a dictionary of attributes for the hls4ml layer. +It also returns a list of output shapes (one for each output. +In this case, there is only a single output with the same shape as the input. + +.. code-block:: Python + + # Parser for converter + def parse_reverse_layer(keras_layer, input_names, input_shapes, data_reader): + layer = {} + layer['class_name'] = 'HReverse' + layer['name'] = keras_layer['config']['name'] + layer['n_in'] = input_shapes[0][1] + + if input_names is not None: + layer['inputs'] = input_names + + return layer, [shape for shape in input_shapes[0]] + + +Next, we need the actual HLS implementaton of the function, which can be written in a header file ``nnet_reverse.h``. + +.. code-block:: C++ + + #ifndef NNET_REVERSE_H_ + #define NNET_REVERSE_H_ + + #include "nnet_common.h" + + namespace nnet { + + struct reverse_config { + static const unsigned n_in = 10; + }; + + template + void reverse( + data_T input[CONFIG_T::n_in], + data_T reversed[CONFIG_T::n_in] + ) { + for (int i = 0; i < CONFIG_T::n_in; i++) { + reversed[CONFIG_T::n_in - 1 - i] = input[i]; + } + } + + } + + #endif + +Next, we can define the layer config and function call templates. +These two templates determine how to populate the config template based on the layer attributes and the function call signature for the layer in HLS, respectively. + +.. 
code-block:: Python + + rev_config_template = """struct config{index} : nnet::reverse_config {{ + static const unsigned n_in = {n_in}; + }};\n""" + + rev_function_template = 'nnet::reverse<{input_t}, {config}>({input}, {output});' + rev_include_list = ['nnet_utils/nnet_reverse.h'] + + + class HReverseConfigTemplate(hls4ml.backends.template.LayerConfigTemplate): + def __init__(self): + super().__init__(HReverse) + self.template = rev_config_template + + def format(self, node): + params = self._default_config_params(node) + return self.template.format(**params) + + + class HReverseFunctionTemplate(hls4ml.backends.template.FunctionCallTemplate): + def __init__(self): + super().__init__(HReverse, include_header=rev_include_list) + self.template = rev_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) + +Now, we need to tell hls4ml about the existence of this new layer by registering it. +We also need to register the parser (a.k.a. the layer handler), the template passes, and source with the particular backend. +In this case, the HLS code is valid for both the Vivado and Quartus backends. + +.. code-block:: Python + + # Register the converter for custom Keras layer + hls4ml.converters.register_keras_layer_handler('KReverse', parse_reverse_layer) + + # Register the hls4ml's IR layer + hls4ml.model.layers.register_layer('HReverse', HReverse) + + for backend_id in ['Vivado', 'Quartus']: + # Register the optimization passes (if any) + backend = hls4ml.backends.get_backend(backend_id) + backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=f'{backend_id.lower()}:optimize') + + # Register template passes for the given backend + backend.register_template(HReverseConfigTemplate) + backend.register_template(HReverseFunctionTemplate) + + # Register HLS implementation + backend.register_source('nnet_reverse.h') + +Finally, we can actually test the hls4ml custom layer compared to the Keras one. + +.. 
code-block:: Python
+
+    # Test if it works
+    kmodel = tf.keras.models.Sequential(
+        [
+            tf.keras.layers.Input(shape=(8,)),
+            KReverse(),
+            tf.keras.layers.ReLU(),
+        ]
+    )
+
+    x = np.random.randint(-5, 5, (8,), dtype='int32')
+    kres = kmodel(x)
+
+    for backend_id in ['Vivado', 'Quartus']:
+
+        hmodel = hls4ml.converters.convert_from_keras_model(
+            kmodel,
+            output_dir=str(f'hls4mlprj_extensions_{backend_id}'),
+            backend=backend_id,
+            io_type='io_parallel',
+            hls_config={'Model': {'Precision': 'ap_int<6>', 'ReuseFactor': 1}},
+        )
+
+        hmodel.compile()
+        hres = hmodel.predict(x.astype('float32'))
+
+        np.testing.assert_array_equal(kres, hres)
diff --git a/test/pytest/test_extensions.py b/test/pytest/test_extensions.py
index 1c8e07198a..e97a58d1f7 100644
--- a/test/pytest/test_extensions.py
+++ b/test/pytest/test_extensions.py
@@ -118,7 +118,7 @@ def format(self, node):
 
 
 @pytest.fixture(scope='session', autouse=True)
-def regsister_custom_layer():
+def register_custom_layer():
     # Register the converter for custom Keras layer
     hls4ml.converters.register_keras_layer_handler('KReverse', parse_reverse_layer)
 

From bc02a8364b9e41ac017a0c4919891bdca771bf64 Mon Sep 17 00:00:00 2001
From: Javier Duarte
Date: Sat, 11 Feb 2023 18:02:13 -0800
Subject: [PATCH 12/12] update

---
 docs/extension.rst | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/docs/extension.rst b/docs/extension.rst
index 8c534702a7..2836430ea8 100644
--- a/docs/extension.rst
+++ b/docs/extension.rst
@@ -18,7 +18,10 @@ To implement a custom layer in hls4ml with the extension API, the required compo
 * Function config template
 * Registration of layer, source code, and templates
 
-For concreteness, let's say our custom layer, implemented in Keras, reverses the order of the last dimension of the inputs.
+Complete example
+================
+
+For concreteness, let's say our custom layer ``KReverse`` is implemented in Keras and reverses the order of the last dimension of the input.
 
 .. code-block:: Python
 
@@ -32,7 +35,7 @@ For concreteness, let's say our custom layer, implemented in Keras, reverses the
     def call(self, inputs):
         return tf.reverse(inputs, axis=[-1])
 
-Now, we can define the equivalent layer in hls4ml, which inherits from ``hls4ml.model.layers.Layer``.
+We can define the equivalent layer in hls4ml ``HReverse``, which inherits from ``hls4ml.model.layers.Layer``.
 
 .. code-block:: Python
 
@@ -46,10 +49,10 @@ Now, we can define the equivalent layer in hls4ml, which inherits from ``hls4ml.
         dims = inp.dim_names
         self.add_output_variable(shape, dims)
 
-A parser for the converter is also required.
+A parser for the Keras to HLS converter is also required.
 This parser reads the attributes of the Keras layer instance and populates a dictionary of attributes for the hls4ml layer.
-It also returns a list of output shapes (one for each output.
-In this case, there is only a single output with the same shape as the input.
+It also returns a list of output shapes (one shape for each output).
+In this case, there is a single output with the same shape as the input.
 
 .. code-block:: Python
 
@@ -65,7 +68,6 @@ In this case, there is only a single output with the same shape as the input.
 
         return layer, [shape for shape in input_shapes[0]]
 
-
 Next, we need the actual HLS implementation of the function, which can be written in a header file ``nnet_reverse.h``.
 
 .. 
code-block:: C++ @@ -95,7 +97,7 @@ Next, we need the actual HLS implementaton of the function, which can be written #endif -Next, we can define the layer config and function call templates. +Now, we can define the layer config and function call templates. These two templates determine how to populate the config template based on the layer attributes and the function call signature for the layer in HLS, respectively. .. code-block:: Python @@ -128,7 +130,7 @@ These two templates determine how to populate the config template based on the l return self.template.format(**params) Now, we need to tell hls4ml about the existence of this new layer by registering it. -We also need to register the parser (a.k.a. the layer handler), the template passes, and source with the particular backend. +We also need to register the parser (a.k.a. the layer handler), the template passes, and HLS implementation source code with the particular backend. In this case, the HLS code is valid for both the Vivado and Quartus backends. .. code-block:: Python
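
The registration snippet in this walkthrough also registers an optimizer pass, ``RemoveDuplicateReverse``, which is not defined in the documentation itself.
A minimal sketch of such a pass follows; it is an illustration only, assuming the ``hls4ml.model.optimizer.OptimizerPass`` interface (``match``/``transform``) together with ``node.get_input_node()`` and ``model.remove_node(..., rewire=True)``.
It drops a pair of back-to-back ``HReverse`` nodes, which together act as an identity.

.. code-block:: Python

    # Sketch of an optimizer pass for the HReverse layer (assumed API:
    # OptimizerPass.match/transform, node.get_input_node(), model.remove_node(..., rewire=True)).
    # Two consecutive reversals cancel out, so both nodes can be dropped.
    class RemoveDuplicateReverse(hls4ml.model.optimizer.OptimizerPass):
        '''Remove two consecutive HReverse layers, which amount to a no-op.'''

        def match(self, node):
            # Fire only when an HReverse node is fed directly by another HReverse
            return isinstance(node, HReverse) and isinstance(node.get_input_node(), HReverse)

        def transform(self, model, node):
            first = node.get_input_node()
            second = node
            # Remove both nodes; rewire=True reconnects their producers and consumers
            model.remove_node(first, rewire=True)
            model.remove_node(second, rewire=True)
            return True

Returning ``True`` from ``transform`` signals that the model graph was modified, so the optimizer framework re-runs matching on the updated graph.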