
Commit 2df478d

Merge remote-tracking branch 'upstream/main' into argmax
2 parents: d3f5858 + 0d1bc8b

80 files changed: +5727 −1691 lines


.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@ Note: Please delete options that are not relevant.

 ## Checklist

-- [ ] I have read the [guidelines for contributing](https://github.com/fastmachinelearning/hls4ml/blob/master/CONTRIBUTING.md).
+- [ ] I have read the [guidelines for contributing](https://github.com/fastmachinelearning/hls4ml/blob/main/CONTRIBUTING.md).
 - [ ] I have commented my code, particularly in hard-to-understand areas.
 - [ ] I have made corresponding changes to the documentation.
 - [ ] My changes generate no new warnings.

.github/workflows/build-sphinx.yml

Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@ name: build-sphinx
 on:
   push:
     branches:
-      - master
+      - main

 jobs:
   build:

@@ -30,4 +30,4 @@ jobs:
       with:
         branch: gh-pages
         directory: gh-pages
-        github_token: ${{ secrets.PERSONAL_TOKEN }}
+        github_token: ${{ secrets.PERSONAL_TOKEN }}

hls4ml/backends/fpga/fpga_backend.py

Lines changed: 238 additions & 0 deletions

@@ -181,6 +181,27 @@ def set_target_reuse_factor(self, layer):

         layer.set_attr('reuse_factor', float(rf) / kernel_multiplies)

+    def get_valid_conv_partition_splits(self, out_height, out_width):
+        """Generate valid partition splits of a Conv1D/2D layer.
+
+        Essentially a list of divisors of the number of pixels of the output image.
+
+        Args:
+            out_height (int): The height of the output image
+            out_width (int): The width of the output image
+
+        Returns:
+            list: List of valid partition splits
+        """
+        n_pixels = out_height * out_width
+        valid_n_partitions = []
+        for i in range(1, int(n_pixels / 2) + 1):
+            if n_pixels % i == 0:
+                valid_n_partitions.append(i)
+        valid_n_partitions.append(n_pixels)
+
+        return valid_n_partitions
+
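For illustration, this helper is just a divisor search; a minimal standalone sketch (the free-function form is mine — the real method lives on the backend class):

    # Sketch: enumerate all valid partition counts for an output image.
    def valid_conv_partition_splits(out_height, out_width):
        n_pixels = out_height * out_width
        # Every divisor up to n_pixels // 2, plus n_pixels itself.
        splits = [i for i in range(1, n_pixels // 2 + 1) if n_pixels % i == 0]
        splits.append(n_pixels)
        return splits

    # A 4x4 output image has 16 pixels, so the valid splits are its divisors:
    assert valid_conv_partition_splits(4, 4) == [1, 2, 4, 8, 16]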
     @classmethod
     def convert_precision_string(cls, precision):
         if isinstance(precision, IntegerPrecisionType) or isinstance(precision, FixedPrecisionType):

@@ -384,6 +405,223 @@ def compute_conv2d_instructions(self, in_H, in_W, in_C, kernel_size=3, stride=1,

         return (min_H, min_W, windows_int)

+    def _compute_conv1d_im2col(self, input_shape, kernel=3, stride=1, pad=(0, 0), dilation=1):
+        W, C = input_shape
+        pad_l, pad_r = pad
+
+        out_w = (W + pad_l + pad_r - (dilation * (kernel - 1) + 1)) // stride + 1
+
+        input_img = np.arange(1, W * C + 1)
+        im_matrix = np.zeros((kernel * C * out_w, ))
+
+        index = 0
+        for i_ow in range(out_w):
+            for i_kw in range(kernel):
+                for i_c in range(C):
+                    input_col = -pad_l + i_kw * dilation + i_ow * stride
+                    if (input_col >= 0 and input_col < W):
+                        im_matrix[index] = input_img[input_col * C + i_c]
+                    else:
+                        im_matrix[index] = 0
+                    index += 1
+
+        im_matrix = im_matrix.reshape(out_w, -1)
+        return im_matrix
+
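A small worked example (mine, not part of the commit) of what _compute_conv1d_im2col returns — each entry is the 1-based index of the input element feeding that kernel tap, and 0 marks a padded position:

    import numpy as np

    # Toy case: W=4, C=1, kernel=3, stride=1, pad=(1, 1), dilation=1,
    # so out_w = (4 + 1 + 1 - 3) // 1 + 1 = 4.
    expected = np.array([
        [0, 1, 2],   # output pixel 0: left pad, inputs 1 and 2
        [1, 2, 3],   # output pixel 1
        [2, 3, 4],   # output pixel 2
        [3, 4, 0],   # output pixel 3: inputs 3 and 4, right pad
    ])
    # With a backend instance `be` (hypothetical handle), this should hold:
    # assert (be._compute_conv1d_im2col((4, 1), 3, 1, (1, 1), 1) == expected).all()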
+    def generate_conv1d_line_buffer_fn(self, layer_idx, n_partitions, in_W, in_C, kernel=3, stride=1, pad=0, dilation=1):
+        """Generate a C++ function that mimics the im2col algorithm. This function works for 1D convolution.
+
+        The HLS compiler produces suboptimal designs for an im2col algorithm implementation, so the trick we use
+        is to generate the result of the im2col transformation explicitly, instead of relying on loops. Since
+        the result depends on the parameters of the convolution layer (the input size, the kernel size, stride, etc.),
+        we need to do this for every convolution layer.
+
+        Args:
+            layer_idx (int): Index of layer ('index' attribute).
+            n_partitions (int): Number of partitions to divide the input into. The pixels in each partition will be processed in parallel.
+            in_W (int): Width of input.
+            in_C (int): Number of channels.
+            kernel (int, optional): Size of the kernel. Defaults to 3.
+            stride (int, optional): Stride length. Defaults to 1.
+            pad (int or Iterable, optional): Padding to apply. Specified as either a number or a list [left_pad, right_pad]. Defaults to 0.
+            dilation (int, optional): Dilation rate. Defaults to 1.
+
+        Returns:
+            str: Generated C++ function
+        """
+        if isinstance(pad, Iterable):
+            pad_left = pad[0]
+            pad_right = pad[1]
+        else:
+            pad_left = pad
+            pad_right = pad
+
+        im2col_matrix = self._compute_conv1d_im2col(
+            (in_W, in_C),
+            kernel,
+            stride,
+            (pad_left, pad_right),
+            dilation
+        )
+
+        generated_code = (
+            "template<class data_T, typename CONFIG_T>\n"
+            "class fill_buffer_{index} : public FillConv1DBuffer<data_T, CONFIG_T> {{\n"
+            "   public:\n"
+            "    static void fill_buffer(\n"
+            "        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n"
+            "        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],\n"
+            "        const unsigned partition\n"
+            "    ) {{\n"
+        ).format(index=layer_idx)
+        indent = '    '
+
+        for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
+            generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
+            for pixel_idx, arr in enumerate(partition):
+                buffer_stmts = []
+                for j, v in enumerate(arr):
+                    if v == 0:
+                        val = '0'
+                    else:
+                        val = 'data[{}]'.format(int(v - 1))
+                    buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
+                generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
+            generated_code += '\n' + indent * 2 + '}\n'
+
+        generated_code += indent + '}\n'
+        generated_code += '};\n'
+
+        return generated_code
+
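To make the generator's output concrete: for the toy 1D case above, with layer_idx=3 and n_partitions=2 (both values hypothetical), the returned string comes out roughly as below. This is derived from the format strings rather than captured output, so exact whitespace is a guess; zeros are padded taps, and data[i] uses the 0-based indices produced by int(v - 1):

    template<class data_T, typename CONFIG_T>
    class fill_buffer_3 : public FillConv1DBuffer<data_T, CONFIG_T> {
       public:
        static void fill_buffer(
            data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
            data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
            const unsigned partition
        ) {
            if (partition ==   0) {
                buffer[0][0] =          0; buffer[0][1] =    data[0]; buffer[0][2] =    data[1];
                buffer[1][0] =    data[0]; buffer[1][1] =    data[1]; buffer[1][2] =    data[2];
            }
            if (partition ==   1) {
                buffer[0][0] =    data[1]; buffer[0][1] =    data[2]; buffer[0][2] =    data[3];
                buffer[1][0] =    data[2]; buffer[1][1] =    data[3]; buffer[1][2] =          0;
            }
        }
    };

Note that pixel_idx restarts from 0 inside each partition, which is why both if-branches write buffer[0] and buffer[1].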
+    def _compute_conv2d_im2col(self, input_shape, kernel=(3, 3), stride=(1, 1), pad=(0, 0, 0, 0), dilation=(1, 1)):
+        H, W, C = input_shape
+        kernel_h, kernel_w = kernel
+        stride_h, stride_w = stride
+        pad_t, pad_b, pad_l, pad_r = pad
+        dilation_h, dilation_w = dilation
+
+        out_h = (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
+        out_w = (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
+
+        input_img = np.arange(1, H * W * C + 1)
+        im_matrix = np.zeros((kernel_h * kernel_w * C * out_h * out_w, ))
+
+        index = 0
+        for i_oh in range(out_h):
+            for i_ow in range(out_w):
+                for i_kh in range(kernel_h):
+                    input_row = -pad_t + i_kh * dilation_h + i_oh * stride_h
+                    for i_kw in range(kernel_w):
+                        for i_c in range(C):
+                            if (input_row < 0 or input_row >= H):
+                                im_matrix[index] = 0
+                            else:
+                                input_col = -pad_l + i_kw * dilation_w + i_ow * stride_w
+                                if (input_col >= 0 and input_col < W):
+                                    im_matrix[index] = input_img[input_row * W * C + input_col * C + i_c]
+                                else:
+                                    im_matrix[index] = 0
+                            index += 1
+
+        im_matrix = im_matrix.reshape(out_h * out_w, -1)
+        return im_matrix
+
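The 2D variant is the same bookkeeping with an extra row loop; a quick check of the output-size formula used above (my own numbers, not from the commit):

    # H = W = 5, C = 1, 3x3 kernel, stride 1, padding 1 on all four sides:
    H, W, C = 5, 5, 1
    kernel_h = kernel_w = 3
    stride_h = stride_w = 1
    pad_t = pad_b = pad_l = pad_r = 1
    dilation_h = dilation_w = 1

    out_h = (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
    out_w = (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
    assert (out_h, out_w) == (5, 5)   # 'same'-style padding preserves the size
    # The im2col matrix then has one row per output pixel and one column per
    # kernel tap: shape (out_h * out_w, kernel_h * kernel_w * C) == (25, 9).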
+    def generate_conv2d_line_buffer_fn(self, layer_idx, n_partitions, in_H, in_W, in_C, kernel=(3, 3), stride=(1, 1), pad=(0, 0, 0, 0), dilation=(1, 1)):
+        """Generate a C++ function that mimics the im2col algorithm. This function works for 2D convolution.
+
+        The HLS compiler produces suboptimal designs for an im2col algorithm implementation, so the trick we use
+        is to generate the result of the im2col transformation explicitly, instead of relying on loops. Since
+        the result depends on the parameters of the convolution layer (the input size, the kernel size, stride, etc.),
+        we need to do this for every convolution layer.
+
+        Args:
+            layer_idx (int): Index of layer ('index' attribute).
+            n_partitions (int): Number of partitions to divide the input into. The pixels in each partition will be processed in parallel.
+            in_H (int): Height of input.
+            in_W (int): Width of input.
+            in_C (int): Number of channels.
+            kernel (int or Iterable, optional): Size of the kernel. Defaults to (3, 3).
+            stride (int or Iterable, optional): Stride length. Defaults to (1, 1).
+            pad (int or Iterable, optional): Padding to apply. Specified as either a number or a list [top_pad, bottom_pad, left_pad, right_pad]. Defaults to (0, 0, 0, 0).
+            dilation (int or Iterable, optional): Dilation rate. Defaults to (1, 1).
+
+        Returns:
+            str: Generated C++ function
+        """
+        if isinstance(kernel, Iterable):
+            kernel_height = kernel[0]
+            kernel_width = kernel[1]
+        else:
+            kernel_height = kernel
+            kernel_width = kernel
+
+        if isinstance(stride, Iterable):
+            stride_height = stride[0]
+            stride_width = stride[1]
+        else:
+            stride_height = stride
+            stride_width = stride
+
+        if isinstance(pad, Iterable):
+            pad_top = pad[0]
+            pad_bottom = pad[1]
+            pad_left = pad[2]
+            pad_right = pad[3]
+        else:
+            pad_top = pad
+            pad_bottom = pad
+            pad_left = pad
+            pad_right = pad
+
+        if isinstance(dilation, Iterable):
+            dilation_height = dilation[0]
+            dilation_width = dilation[1]
+        else:
+            dilation_height = dilation
+            dilation_width = dilation
+
+        im2col_matrix = self._compute_conv2d_im2col(
+            (in_H, in_W, in_C),
+            (kernel_height, kernel_width),
+            (stride_height, stride_width),
+            (pad_top, pad_bottom, pad_left, pad_right),
+            (dilation_height, dilation_width)
+        )
+
+        generated_code = (
+            "template<class data_T, typename CONFIG_T>\n"
+            "class fill_buffer_{index} : public FillConv2DBuffer<data_T, CONFIG_T> {{\n"
+            "   public:\n"
+            "    static void fill_buffer(\n"
+            "        data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],\n"
+            "        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan],\n"
+            "        const unsigned partition\n"
+            "    ) {{\n"
+        ).format(index=layer_idx)
+        indent = '    '
+
+        for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
+            generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
+            for pixel_idx, arr in enumerate(partition):
+                buffer_stmts = []
+                for j, v in enumerate(arr):
+                    if v == 0:
+                        val = '0'
+                    else:
+                        val = 'data[{}]'.format(int(v - 1))
+                    buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
+                generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
+            generated_code += '\n' + indent * 2 + '}\n'
+
+        generated_code += indent + '}\n'
+        generated_code += '};\n'
+
+        return generated_code
+
     @model_optimizer()
     def write_hls(self, model):
         self.writer.write_hls(model)
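Both generators accept scalars or iterables for kernel/stride/pad/dilation, so callers do not need to normalize their arguments. A hedged end-to-end sketch tying the partition-split helper to the 2D generator (the backend handle `be` and the layer index 7 are hypothetical):

    # For a 5x5 output image, the valid partition counts are the divisors
    # of 25: [1, 5, 25].
    # splits = be.get_valid_conv_partition_splits(5, 5)
    # code_str = be.generate_conv2d_line_buffer_fn(
    #     7,                       # layer 'index' attribute (hypothetical)
    #     splits[1],               # 5 partitions -> 5 output pixels per call
    #     5, 5, 1,                 # in_H, in_W, in_C
    #     kernel=3,                # scalar, broadcast to (3, 3)
    #     stride=1,                # scalar, broadcast to (1, 1)
    #     pad=1,                   # scalar, broadcast to all four sides
    # )
    # print(code_str)              # the C++ fill_buffer_7 class as a string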

hls4ml/backends/fpga/fpga_layers.py

Lines changed: 14 additions & 6 deletions

@@ -1,10 +1,6 @@
 import numpy as np
-import re
-
-from hls4ml.model.optimizer import OptimizerPass
-from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType
-from hls4ml.model.layers import Layer, Activation, Dense, BatchNormalization, register_layer
-from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.types import IntegerPrecisionType, XnorPrecisionType
+from hls4ml.model.layers import Layer, Conv1D, Conv2D

 class BatchNormalizationQuantizedTanh(Layer):
     ''' Merged Batch Normalization and quantized (binary or ternary) Tanh layer.

@@ -42,3 +38,15 @@ def set_thresholds(self, scale, bias, ternary_threshold=0.5):
         threshold_lo = np.floor(threshold_lo * 2**F) / 2**F
         self.add_weights_variable(name='threshold_hi', var_name='th{index}', data=threshold_hi, type_name='threshold_hi_{index}_t', precision=inp.type.precision)
         self.add_weights_variable(name='threshold_lo', var_name='tl{index}', data=threshold_lo, type_name='threshold_lo_{index}_t', precision=inp.type.precision)
+
+class PointwiseConv1D(Conv1D):
+    ''' Optimized Conv1D implementation for 1x1 kernels. '''
+
+    # Nothing to do, will pick up function and config from class name
+    pass
+
+class PointwiseConv2D(Conv2D):
+    ''' Optimized Conv2D implementation for 1x1 kernels. '''
+
+    # Nothing to do, will pick up function and config from class name
+    pass
(New file — path not shown in this view)

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.layers import Conv1D, Conv2D
+from hls4ml.model.types import Source
+
+class GenerateConvIm2col(OptimizerPass):
+    ''' Generates code for the im2col step of 1D/2D convolution '''
+    def match(self, node):
+        return isinstance(node, (Conv1D, Conv2D)) and \
+            node.model.config.get_config_value('IOType') == 'io_parallel'
+
+    def transform(self, model, node):
+        node_class = node.__class__.__name__
+        if '1D' in node_class:
+            self._generate_im2col_1d(node)
+        elif '2D' in node_class:
+            self._generate_im2col_2d(node)
+        else:
+            raise Exception('Cannot generate instructions for node {} ({})'.format(node.name, node_class))
+
+    def _generate_im2col_1d(self, node):
+        code_str = node.model.config.backend.generate_conv1d_line_buffer_fn(
+            node.get_attr('index'),
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            kernel=node.get_attr('filt_width'),
+            stride=node.get_attr('stride_width'),
+            pad=(node.get_attr('pad_left'), node.get_attr('pad_right'))
+        )
+
+        node.set_attr('line_buffer_codegen', Source(code_str))
+
+    def _generate_im2col_2d(self, node):
+        code_str = node.model.config.backend.generate_conv2d_line_buffer_fn(
+            node.get_attr('index'),
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            node.get_input_variable().shape[2],
+            kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')),
+            stride=(node.get_attr('stride_height'), node.get_attr('stride_width')),
+            pad=(node.get_attr('pad_top'), node.get_attr('pad_bottom'), node.get_attr('pad_left'), node.get_attr('pad_right'))
+        )
+
+        node.set_attr('line_buffer_codegen', Source(code_str))
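The pass follows the usual OptimizerPass contract: match selects the nodes the pass applies to, and transform mutates them — here attaching the generated C++ as a Source attribute for the writer to emit later. A simplified sketch of that contract (my paraphrase, not the actual hls4ml base class):

    class OptimizerPassSketch:
        """Simplified view of the interface GenerateConvIm2col implements."""

        def match(self, node):
            # Return True for nodes this pass should transform.
            raise NotImplementedError

        def transform(self, model, node):
            # Mutate node/model in place. Return True if the model graph
            # changed structurally (so matching restarts); returning
            # False/None, as GenerateConvIm2col effectively does, signals
            # that only node attributes changed.
            raise NotImplementedError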
(New file — path not shown in this view)

Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
+import numpy as np
+
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.layers import Reshape
+
+
+class RemoveFinalReshape(OptimizerPass):
+    ''' Remove reshape if final layer '''
+    def match(self, node):
+        # match if reshape is final node
+        return isinstance(node, Reshape) and not node.get_output_nodes()
+
+    def transform(self, model, node):
+        if model.config.get_config_value('IOType') == 'io_parallel':
+            print('WARNING: Final layer is a Reshape, which does not affect the output for io_parallel; removing it')
+            # remove, but don't rewire because it's the output layer
+            model.remove_node(node, rewire=False)
+            return True
+        elif model.config.get_config_value('IOType') == 'io_stream':
+            print('WARNING: Final layer is a Reshape, which may incur a large resource cost for io_stream; consider removing it')
+            return False
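The io_parallel case is safe to remove because a trailing Reshape only reinterprets the layout of an already-flat output; a quick numpy check (mine, not from the commit):

    import numpy as np

    x = np.arange(12, dtype=np.float32)          # flat layer output
    reshaped = x.reshape(3, 4)                   # what the final Reshape would emit
    assert np.array_equal(reshaped.ravel(), x)   # flattened views are identical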
