
Commit 2df478d

Merge remote-tracking branch 'upstream/main' into argmax
2 parents: d3f5858 + 0d1bc8b

80 files changed: +5727 −1691 lines


.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@ Note: Please delete options that are not relevant.

 ## Checklist

-- [ ] I have read the [guidelines for contributing](https://github.com/fastmachinelearning/hls4ml/blob/master/CONTRIBUTING.md).
+- [ ] I have read the [guidelines for contributing](https://github.com/fastmachinelearning/hls4ml/blob/main/CONTRIBUTING.md).
 - [ ] I have commented my code, particularly in hard-to-understand areas.
 - [ ] I have made corresponding changes to the documentation.
 - [ ] My changes generate no new warnings.

.github/workflows/build-sphinx.yml

Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@ name: build-sphinx
 on:
   push:
     branches:
-      - master
+      - main

 jobs:
   build:

@@ -30,4 +30,4 @@ jobs:
       with:
         branch: gh-pages
         directory: gh-pages
-        github_token: ${{ secrets.PERSONAL_TOKEN }}
+        github_token: ${{ secrets.PERSONAL_TOKEN }}

hls4ml/backends/fpga/fpga_backend.py

Lines changed: 238 additions & 0 deletions

@@ -181,6 +181,27 @@ def set_target_reuse_factor(self, layer):

         layer.set_attr('reuse_factor', float(rf) / kernel_multiplies)

+    def get_valid_conv_partition_splits(self, out_height, out_width):
+        """Generate valid partition splits of a Conv1D/2D layer.
+
+        Essentially a list of divisors of the number of pixels of the output image.
+
+        Args:
+            out_height (int): The height of the output image
+            out_width (int): The width of the output image
+
+        Returns:
+            list: List of valid partition splits
+        """
+        n_pixels = out_height * out_width
+        valid_n_partitions = []
+        for i in range(1, int(n_pixels / 2) + 1):
+            if n_pixels % i == 0:
+                valid_n_partitions.append(i)
+        valid_n_partitions.append(n_pixels)
+
+        return valid_n_partitions
+
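For illustration, this helper is just a divisor search; a minimal standalone sketch (the free-function form is mine — the real method lives on the backend class):

    # Sketch: enumerate all valid partition counts for an output image.
    def valid_conv_partition_splits(out_height, out_width):
        n_pixels = out_height * out_width
        # Every divisor up to n_pixels // 2, plus n_pixels itself.
        splits = [i for i in range(1, n_pixels // 2 + 1) if n_pixels % i == 0]
        splits.append(n_pixels)
        return splits

    # A 4x4 output image has 16 pixels, so the valid splits are its divisors:
    assert valid_conv_partition_splits(4, 4) == [1, 2, 4, 8, 16]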
     @classmethod
     def convert_precision_string(cls, precision):
         if isinstance(precision, IntegerPrecisionType) or isinstance(precision, FixedPrecisionType):

@@ -384,6 +405,223 @@ def compute_conv2d_instructions(self, in_H, in_W, in_C, kernel_size=3, stride=1,

         return (min_H, min_W, windows_int)

+    def _compute_conv1d_im2col(self, input_shape, kernel=3, stride=1, pad=(0, 0), dilation=1):
+        W, C = input_shape
+        pad_l, pad_r = pad
+
+        out_w = (W + pad_l + pad_r - (dilation * (kernel - 1) + 1)) // stride + 1
+
+        input_img = np.arange(1, W * C + 1)
+        im_matrix = np.zeros((kernel * C * out_w, ))
+
+        index = 0
+        for i_ow in range(out_w):
+            for i_kw in range(kernel):
+                for i_c in range(C):
+                    input_col = -pad_l + i_kw * dilation + i_ow * stride
+                    if (input_col >= 0 and input_col < W):
+                        im_matrix[index] = input_img[input_col * C + i_c]
+                    else:
+                        im_matrix[index] = 0
+                    index += 1
+
+        im_matrix = im_matrix.reshape(out_w, -1)
+        return im_matrix
+
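A small worked example (mine, not part of the commit) of what _compute_conv1d_im2col returns — each entry is the 1-based index of the input element feeding that kernel tap, and 0 marks a padded position:

    import numpy as np

    # Toy case: W=4, C=1, kernel=3, stride=1, pad=(1, 1), dilation=1,
    # so out_w = (4 + 1 + 1 - 3) // 1 + 1 = 4.
    expected = np.array([
        [0, 1, 2],   # output pixel 0: left pad, inputs 1 and 2
        [1, 2, 3],   # output pixel 1
        [2, 3, 4],   # output pixel 2
        [3, 4, 0],   # output pixel 3: inputs 3 and 4, right pad
    ])
    # With a backend instance `be` (hypothetical handle), this should hold:
    # assert (be._compute_conv1d_im2col((4, 1), 3, 1, (1, 1), 1) == expected).all()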
+    def generate_conv1d_line_buffer_fn(self, layer_idx, n_partitions, in_W, in_C, kernel=3, stride=1, pad=0, dilation=1):
+        """Generate a C++ function that mimics the im2col algorithm. This function works for 1D convolution.
+
+        The HLS compiler produces suboptimal designs for an im2col algorithm implementation, so the trick we use
+        is to generate the result of the im2col transformation explicitly, instead of relying on loops. Since
+        the result depends on the parameters of the convolution layer (the input size, the kernel size, stride, etc.),
+        we need to do this for every convolution layer.
+
+        Args:
+            layer_idx (int): Index of layer ('index' attribute).
+            n_partitions (int): Number of partitions to divide the input into. The pixels in each partition will be processed in parallel.
+            in_W (int): Width of input.
+            in_C (int): Number of channels.
+            kernel (int, optional): Size of the kernel. Defaults to 3.
+            stride (int, optional): Stride length. Defaults to 1.
+            pad (int or Iterable, optional): Padding to apply. Specified as either a number or a list [left_pad, right_pad]. Defaults to 0.
+            dilation (int, optional): Dilation rate. Defaults to 1.
+
+        Returns:
+            str: Generated C++ function
+        """
+        if isinstance(pad, Iterable):
+            pad_left = pad[0]
+            pad_right = pad[1]
+        else:
+            pad_left = pad
+            pad_right = pad
+
+        im2col_matrix = self._compute_conv1d_im2col(
+            (in_W, in_C),
+            kernel,
+            stride,
+            (pad_left, pad_right),
+            dilation
+        )
+
+        generated_code = (
+            "template<class data_T, typename CONFIG_T>\n"
+            "class fill_buffer_{index} : public FillConv1DBuffer<data_T, CONFIG_T> {{\n"
+            "   public:\n"
+            "    static void fill_buffer(\n"
+            "        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n"
+            "        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],\n"
+            "        const unsigned partition\n"
+            "    ) {{\n"
+        ).format(index=layer_idx)
+        indent = '    '
+
+        for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
+            generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
+            for pixel_idx, arr in enumerate(partition):
+                buffer_stmts = []
+                for j, v in enumerate(arr):
+                    if v == 0:
+                        val = '0'
+                    else:
+                        val = 'data[{}]'.format(int(v - 1))
+                    buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
+                generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
+            generated_code += '\n' + indent * 2 + '}\n'
+
+        generated_code += indent + '}\n'
+        generated_code += '};\n'
+
+        return generated_code
+
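To make the generator's output concrete: for the toy 1D case above, with layer_idx=3 and n_partitions=2 (both values hypothetical), the returned string comes out roughly as below. This is derived from the format strings rather than captured output, so exact whitespace is a guess; zeros are padded taps, and data[i] uses the 0-based indices produced by int(v - 1):

    template<class data_T, typename CONFIG_T>
    class fill_buffer_3 : public FillConv1DBuffer<data_T, CONFIG_T> {
       public:
        static void fill_buffer(
            data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
            data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan],
            const unsigned partition
        ) {
            if (partition ==   0) {
                buffer[0][0] =          0; buffer[0][1] =    data[0]; buffer[0][2] =    data[1];
                buffer[1][0] =    data[0]; buffer[1][1] =    data[1]; buffer[1][2] =    data[2];
            }
            if (partition ==   1) {
                buffer[0][0] =    data[1]; buffer[0][1] =    data[2]; buffer[0][2] =    data[3];
                buffer[1][0] =    data[2]; buffer[1][1] =    data[3]; buffer[1][2] =          0;
            }
        }
    };

Note that pixel_idx restarts from 0 inside each partition, which is why both if-branches write buffer[0] and buffer[1].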
+    def _compute_conv2d_im2col(self, input_shape, kernel=(3, 3), stride=(1, 1), pad=(0, 0, 0, 0), dilation=(1, 1)):
+        H, W, C = input_shape
+        kernel_h, kernel_w = kernel
+        stride_h, stride_w = stride
+        pad_t, pad_b, pad_l, pad_r = pad
+        dilation_h, dilation_w = dilation
+
+        out_h = (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
+        out_w = (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
+
+        input_img = np.arange(1, H * W * C + 1)
+        im_matrix = np.zeros((kernel_h * kernel_w * C * out_h * out_w, ))
+
+        index = 0
+        for i_oh in range(out_h):
+            for i_ow in range(out_w):
+                for i_kh in range(kernel_h):
+                    input_row = -pad_t + i_kh * dilation_h + i_oh * stride_h
+                    for i_kw in range(kernel_w):
+                        for i_c in range(C):
+                            if (input_row < 0 or input_row >= H):
+                                im_matrix[index] = 0
+                            else:
+                                input_col = -pad_l + i_kw * dilation_w + i_ow * stride_w
+                                if (input_col >= 0 and input_col < W):
+                                    im_matrix[index] = input_img[input_row * W * C + input_col * C + i_c]
+                                else:
+                                    im_matrix[index] = 0
+                            index += 1
+
+        im_matrix = im_matrix.reshape(out_h * out_w, -1)
+        return im_matrix
+
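The 2D variant is the same bookkeeping with an extra row loop; a quick check of the output-size formula used above (my own numbers, not from the commit):

    # H = W = 5, C = 1, 3x3 kernel, stride 1, padding 1 on all four sides:
    H, W, C = 5, 5, 1
    kernel_h = kernel_w = 3
    stride_h = stride_w = 1
    pad_t = pad_b = pad_l = pad_r = 1
    dilation_h = dilation_w = 1

    out_h = (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
    out_w = (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
    assert (out_h, out_w) == (5, 5)   # 'same'-style padding preserves the size
    # The im2col matrix then has one row per output pixel and one column per
    # kernel tap: shape (out_h * out_w, kernel_h * kernel_w * C) == (25, 9).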
+    def generate_conv2d_line_buffer_fn(self, layer_idx, n_partitions, in_H, in_W, in_C, kernel=(3, 3), stride=(1, 1), pad=(0, 0, 0, 0), dilation=(1, 1)):
+        """Generate a C++ function that mimics the im2col algorithm. This function works for 2D convolution.
+
+        The HLS compiler produces suboptimal designs for an im2col algorithm implementation, so the trick we use
+        is to generate the result of the im2col transformation explicitly, instead of relying on loops. Since
+        the result depends on the parameters of the convolution layer (the input size, the kernel size, stride, etc.),
+        we need to do this for every convolution layer.
+
+        Args:
+            layer_idx (int): Index of layer ('index' attribute).
+            n_partitions (int): Number of partitions to divide the input into. The pixels in each partition will be processed in parallel.
+            in_H (int): Height of input.
+            in_W (int): Width of input.
+            in_C (int): Number of channels.
+            kernel (int or Iterable, optional): Size of the kernel. Defaults to (3, 3).
+            stride (int or Iterable, optional): Stride length. Defaults to (1, 1).
+            pad (int or Iterable, optional): Padding to apply. Specified as either a number or a list [top_pad, bottom_pad, left_pad, right_pad]. Defaults to (0, 0, 0, 0).
+            dilation (int or Iterable, optional): Dilation rate. Defaults to (1, 1).
+
+        Returns:
+            str: Generated C++ function
+        """
+        if isinstance(kernel, Iterable):
+            kernel_height = kernel[0]
+            kernel_width = kernel[1]
+        else:
+            kernel_height = kernel
+            kernel_width = kernel
+
+        if isinstance(stride, Iterable):
+            stride_height = stride[0]
+            stride_width = stride[1]
+        else:
+            stride_height = stride
+            stride_width = stride
+
+        if isinstance(pad, Iterable):
+            pad_top = pad[0]
+            pad_bottom = pad[1]
+            pad_left = pad[2]
+            pad_right = pad[3]
+        else:
+            pad_top = pad
+            pad_bottom = pad
+            pad_left = pad
+            pad_right = pad
+
+        if isinstance(dilation, Iterable):
+            dilation_height = dilation[0]
+            dilation_width = dilation[1]
+        else:
+            dilation_height = dilation
+            dilation_width = dilation
+
+        im2col_matrix = self._compute_conv2d_im2col(
+            (in_H, in_W, in_C),
+            (kernel_height, kernel_width),
+            (stride_height, stride_width),
+            (pad_top, pad_bottom, pad_left, pad_right),
+            (dilation_height, dilation_width)
+        )
+
+        generated_code = (
+            "template<class data_T, typename CONFIG_T>\n"
+            "class fill_buffer_{index} : public FillConv2DBuffer<data_T, CONFIG_T> {{\n"
+            "   public:\n"
+            "    static void fill_buffer(\n"
+            "        data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan],\n"
+            "        data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan],\n"
+            "        const unsigned partition\n"
+            "    ) {{\n"
+        ).format(index=layer_idx)
+        indent = '    '
+
+        for partition_idx, partition in enumerate(np.split(im2col_matrix, n_partitions)):
+            generated_code += indent * 2 + 'if (partition == {:>3}) {{\n'.format(partition_idx)
+            for pixel_idx, arr in enumerate(partition):
+                buffer_stmts = []
+                for j, v in enumerate(arr):
+                    if v == 0:
+                        val = '0'
+                    else:
+                        val = 'data[{}]'.format(int(v - 1))
+                    buffer_stmts.append('buffer[{}][{}] = {:>10};'.format(pixel_idx, j, val))
+                generated_code += indent * 3 + ' '.join(buffer_stmts) + '\n'
+            generated_code += '\n' + indent * 2 + '}\n'
+
+        generated_code += indent + '}\n'
+        generated_code += '};\n'
+
+        return generated_code
+
     @model_optimizer()
     def write_hls(self, model):
         self.writer.write_hls(model)
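Both generators accept scalars or iterables for kernel/stride/pad/dilation, so callers do not need to normalize their arguments. A hedged end-to-end sketch tying the partition-split helper to the 2D generator (the backend handle `be` and the layer index 7 are hypothetical):

    # For a 5x5 output image, the valid partition counts are the divisors
    # of 25: [1, 5, 25].
    # splits = be.get_valid_conv_partition_splits(5, 5)
    # code_str = be.generate_conv2d_line_buffer_fn(
    #     7,                       # layer 'index' attribute (hypothetical)
    #     splits[1],               # 5 partitions -> 5 output pixels per call
    #     5, 5, 1,                 # in_H, in_W, in_C
    #     kernel=3,                # scalar, broadcast to (3, 3)
    #     stride=1,                # scalar, broadcast to (1, 1)
    #     pad=1,                   # scalar, broadcast to all four sides
    # )
    # print(code_str)              # the C++ fill_buffer_7 class as a string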

hls4ml/backends/fpga/fpga_layers.py

Lines changed: 14 additions & 6 deletions

@@ -1,10 +1,6 @@
 import numpy as np
-import re
-
-from hls4ml.model.optimizer import OptimizerPass
-from hls4ml.model.types import IntegerPrecisionType, NamedType, XnorPrecisionType
-from hls4ml.model.layers import Layer, Activation, Dense, BatchNormalization, register_layer
-from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.types import IntegerPrecisionType, XnorPrecisionType
+from hls4ml.model.layers import Layer, Conv1D, Conv2D

 class BatchNormalizationQuantizedTanh(Layer):
     ''' Merged Batch Normalization and quantized (binary or ternary) Tanh layer.

@@ -42,3 +38,15 @@ def set_thresholds(self, scale, bias, ternary_threshold=0.5):
         threshold_lo = np.floor(threshold_lo * 2**F) / 2**F
         self.add_weights_variable(name='threshold_hi', var_name='th{index}', data=threshold_hi, type_name='threshold_hi_{index}_t', precision=inp.type.precision)
         self.add_weights_variable(name='threshold_lo', var_name='tl{index}', data=threshold_lo, type_name='threshold_lo_{index}_t', precision=inp.type.precision)
+
+class PointwiseConv1D(Conv1D):
+    ''' Optimized Conv1D implementation for 1x1 kernels. '''
+
+    # Nothing to do, will pick up function and config from class name
+    pass
+
+class PointwiseConv2D(Conv2D):
+    ''' Optimized Conv2D implementation for 1x1 kernels. '''
+
+    # Nothing to do, will pick up function and config from class name
+    pass
(New file — path not shown in this view)

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.layers import Conv1D, Conv2D
+from hls4ml.model.types import Source
+
+class GenerateConvIm2col(OptimizerPass):
+    ''' Generates code for the im2col step of 1D/2D convolution '''
+    def match(self, node):
+        return isinstance(node, (Conv1D, Conv2D)) and \
+            node.model.config.get_config_value('IOType') == 'io_parallel'
+
+    def transform(self, model, node):
+        node_class = node.__class__.__name__
+        if '1D' in node_class:
+            self._generate_im2col_1d(node)
+        elif '2D' in node_class:
+            self._generate_im2col_2d(node)
+        else:
+            raise Exception('Cannot generate instructions for node {} ({})'.format(node.name, node_class))
+
+    def _generate_im2col_1d(self, node):
+        code_str = node.model.config.backend.generate_conv1d_line_buffer_fn(
+            node.get_attr('index'),
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            kernel=node.get_attr('filt_width'),
+            stride=node.get_attr('stride_width'),
+            pad=(node.get_attr('pad_left'), node.get_attr('pad_right'))
+        )
+
+        node.set_attr('line_buffer_codegen', Source(code_str))
+
+    def _generate_im2col_2d(self, node):
+        code_str = node.model.config.backend.generate_conv2d_line_buffer_fn(
+            node.get_attr('index'),
+            node.get_attr('n_partitions'),
+            node.get_input_variable().shape[0],
+            node.get_input_variable().shape[1],
+            node.get_input_variable().shape[2],
+            kernel=(node.get_attr('filt_height'), node.get_attr('filt_width')),
+            stride=(node.get_attr('stride_height'), node.get_attr('stride_width')),
+            pad=(node.get_attr('pad_top'), node.get_attr('pad_bottom'), node.get_attr('pad_left'), node.get_attr('pad_right'))
+        )
+
+        node.set_attr('line_buffer_codegen', Source(code_str))
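The pass follows the usual OptimizerPass contract: match selects the nodes the pass applies to, and transform mutates them — here attaching the generated C++ as a Source attribute for the writer to emit later. A simplified sketch of that contract (my paraphrase, not the actual hls4ml base class):

    class OptimizerPassSketch:
        """Simplified view of the interface GenerateConvIm2col implements."""

        def match(self, node):
            # Return True for nodes this pass should transform.
            raise NotImplementedError

        def transform(self, model, node):
            # Mutate node/model in place. Return True if the model graph
            # changed structurally (so matching restarts); returning
            # False/None, as GenerateConvIm2col effectively does, signals
            # that only node attributes changed.
            raise NotImplementedError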
(New file — path not shown in this view)

Lines changed: 21 additions & 0 deletions

@@ -0,0 +1,21 @@
+import numpy as np
+
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.layers import Reshape
+
+
+class RemoveFinalReshape(OptimizerPass):
+    ''' Remove reshape if final layer '''
+    def match(self, node):
+        # match if reshape is final node
+        return isinstance(node, Reshape) and not node.get_output_nodes()
+
+    def transform(self, model, node):
+        if model.config.get_config_value('IOType') == 'io_parallel':
+            print('WARNING: Final layer is a Reshape, which does not affect the output for io_parallel; removing it')
+            # remove, but don't rewire because it's the output layer
+            model.remove_node(node, rewire=False)
+            return True
+        elif model.config.get_config_value('IOType') == 'io_stream':
+            print('WARNING: Final layer is a Reshape, which may incur a large resource cost for io_stream; consider removing it')
+            return False
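The io_parallel case is safe to remove because a trailing Reshape only reinterprets the layout of an already-flat output; a quick numpy check (mine, not from the commit):

    import numpy as np

    x = np.arange(12, dtype=np.float32)          # flat layer output
    reshaped = x.reshape(3, 4)                   # what the final Reshape would emit
    assert np.array_equal(reshaped.ravel(), x)   # flattened views are identical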
