Skip to content

Commit 99cc182

Browse files
Merge branch 'main' of github.com:pulp-platform/pulp-nnx into fconti/neureka
2 parents bbd4b7a + b4d7cd4 commit 99cc182

File tree

15 files changed

+216
-102
lines changed

15 files changed

+216
-102
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,12 +8,14 @@
88
- Support for kernels without normalization and quantization for NE16
99
- isort check
1010
- publication citation
11+
- support for 32-bit scale
1112

1213
### Changed
1314

1415
- `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant`
1516
- strides in `ne16_task_set_strides`, `ne16_task_set_dims`, and `ne16_task_set_ptrs` are now strides between consecutive elements in that dimension
1617
- `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE`
18+
- `ne16_task_set_ptrs` split into `ne16_task_set_ptrs_conv` and `ne16_task_set_ptrs_norm_quant`
1719

1820
### Removed
1921

ne16/README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828
- [ ] Scale type
2929
- [x] uint8
3030
- [ ] uint16
31-
- [ ] uint32
31+
- [x] uint32
3232
- [x] Bias type
3333
- [x] int32
3434
- [ ] Weight type

ne16/hal/ne16_task.c

Lines changed: 14 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -113,15 +113,18 @@ uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, uint32_t width_stride,
113113
return ptr - (padding_top * width + padding_left) * width_stride;
114114
}
115115

116-
void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
117-
uint32_t w_in_stride, uint8_t padding_top,
118-
uint8_t padding_left, uint32_t output_ptr,
119-
uint32_t weights_ptr, uint32_t scale_ptr,
120-
uint32_t shift_ptr, uint32_t bias_ptr) {
116+
void ne16_task_set_ptrs_conv(ne16_task_t *task, uint32_t input_ptr,
117+
uint32_t w_in, uint32_t w_in_stride,
118+
uint8_t padding_top, uint8_t padding_left,
119+
uint32_t output_ptr, uint32_t weights_ptr) {
121120
task->data.infeat_ptr =
122121
ne16_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
123122
task->data.outfeat_ptr = output_ptr;
124123
task->data.weights_ptr = weights_ptr;
124+
}
125+
126+
void ne16_task_set_ptrs_norm_quant(ne16_task_t *task, uint32_t scale_ptr,
127+
uint32_t shift_ptr, uint32_t bias_ptr) {
125128
task->data.scale_ptr = scale_ptr;
126129
task->data.scale_shift_ptr = shift_ptr;
127130
task->data.scale_bias_ptr = bias_ptr;
@@ -206,8 +209,8 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
206209
}
207210

208211
void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
209-
const uint8_t right, const uint8_t bottom,
210-
const uint8_t left) {
212+
const uint8_t bottom, const uint8_t left,
213+
const uint8_t right) {
211214
task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
212215
((bottom & 0xff) << 8) | ((left & 0xff) << 0);
213216
}
@@ -219,8 +222,8 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
219222
const uint32_t h_out_stride,
220223
const uint32_t w_out_stride, const uint8_t padding_top,
221224
const uint8_t padding_bottom,
222-
const uint8_t padding_right,
223-
const uint8_t padding_left) {
225+
const uint8_t padding_left,
226+
const uint8_t padding_right) {
224227
ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
225228
w_out_stride);
226229
ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
@@ -235,8 +238,8 @@ void ne16_task_set_dims_stride2x2(
235238
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
236239
const uint32_t h_out_stride, const uint32_t w_out_stride,
237240
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
238-
const uint8_t padding_bottom, const uint8_t padding_right,
239-
const uint8_t padding_left) {
241+
const uint8_t padding_bottom, const uint8_t padding_left,
242+
const uint8_t padding_right) {
240243
const uint8_t stride = 2;
241244

242245
// WARNING: works only for even output channel stride (divisible by 2)

ne16/hal/ne16_task.h

Lines changed: 16 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -42,8 +42,8 @@ typedef enum {
4242

4343
typedef struct ne16_norm_t {
4444
ne16_norm_mode_e mode;
45-
int flag_bias;
46-
int flag_shift;
45+
ne16_task_flag_e flag_bias;
46+
ne16_task_flag_e flag_shift;
4747
} ne16_norm_t;
4848

4949
typedef enum ne16_quant_mode_e {
@@ -59,9 +59,9 @@ typedef enum ne16_quant_function_e {
5959

6060
typedef struct ne16_quant_t {
6161
// Shift amount must be in range 0x00-0x1F
62-
unsigned shift_amount;
62+
uint8_t shift_amount;
6363
ne16_quant_function_e function;
64-
int flag_rounding;
64+
ne16_task_flag_e flag_rounding;
6565
} ne16_quant_t;
6666

6767
typedef struct ne16_stride_t {
@@ -133,11 +133,12 @@ uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
133133
uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
134134
const uint32_t width_stride, const uint8_t padding_top,
135135
const uint8_t padding_left);
136-
void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
137-
uint32_t w_in_stride, uint8_t padding_top,
138-
uint8_t padding_left, uint32_t output_ptr,
139-
uint32_t weights_ptr, uint32_t scale_ptr,
140-
uint32_t shift_ptr, uint32_t bias_ptr);
136+
void ne16_task_set_ptrs_conv(ne16_task_t *task, uint32_t input_ptr,
137+
uint32_t w_in, uint32_t w_in_stride,
138+
uint8_t padding_top, uint8_t padding_left,
139+
uint32_t output_ptr, uint32_t weights_ptr);
140+
void ne16_task_set_ptrs_norm_quant(ne16_task_t *task, uint32_t scale_ptr,
141+
uint32_t shift_ptr, uint32_t bias_ptr);
141142
/** ne16_task_set_strides
142143
*
143144
* All the strides variables are strides between elements alongside that
@@ -157,8 +158,8 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
157158
const uint8_t bottom, const uint8_t left,
158159
const uint8_t right, const uint8_t value);
159160
void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
160-
const uint8_t right, const uint8_t bottom,
161-
const uint8_t left);
161+
const uint8_t bottom, const uint8_t left,
162+
const uint8_t right);
162163
/** ne16_task_set_dims
163164
*
164165
* All the strides variables are strides between elements alongside that
@@ -172,8 +173,8 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
172173
const uint32_t h_out_stride,
173174
const uint32_t w_out_stride, const uint8_t padding_top,
174175
const uint8_t padding_bottom,
175-
const uint8_t padding_right,
176-
const uint8_t padding_left);
176+
const uint8_t padding_left,
177+
const uint8_t padding_right);
177178
/** ne16_task_set_dims_stride2x2
178179
*
179180
* All the strides variables are strides between elements alongside that
@@ -186,7 +187,7 @@ void ne16_task_set_dims_stride2x2(
186187
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
187188
const uint32_t h_out_stride, const uint32_t w_out_stride,
188189
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
189-
const uint8_t padding_bottom, const uint8_t padding_right,
190-
const uint8_t padding_left);
190+
const uint8_t padding_bottom, const uint8_t padding_left,
191+
const uint8_t padding_right);
191192

192193
#endif // !__NE16_TASK_H__

neureka/README.md

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,17 +16,16 @@ Github repo [link](https://github.com/siracusa-soc/ne).
1616
- [x] Bias (w/ and w/o)
1717
- [ ] Per-channel shift
1818
- [x] Per-layer shift
19-
- [ ] Rounding
2019
- [x] Input type
2120
- [x] uint8
2221
- [x] int8
2322
- [x] Output type
2423
- [x] int8
2524
- [x] uint8 (only w/ Relu)
2625
- [x] int32
27-
- [ ] Scale type
26+
- [x] Scale type
2827
- [x] uint8
29-
- [ ] uint32
28+
- [x] uint32
3029
- [x] Bias type
3130
- [x] int32
3231
- [ ] Weight type

neureka/bsp/siracusa/neureka_siracusa_bsp.h

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,8 @@
1818
* SPDX-License-Identifier: Apache-2.0
1919
*/
2020

21-
#ifndef __NEUREKA_siracusa_BSP_H__
22-
#define __NEUREKA_siracusa_BSP_H__
21+
#ifndef __NEUREKA_SIRACUSA_BSP_H__
22+
#define __NEUREKA_SIRACUSA_BSP_H__
2323

2424
#include "neureka.h"
2525
#include <stdint.h>
@@ -64,4 +64,4 @@ void neureka_siracusa_close();
6464
void neureka_siracusa_event_wait_and_clear();
6565
const neureka_dev_t *neureka_siracusa_get_dev();
6666

67-
#endif // !__NEUREKA_siracusa_BSP_H__
67+
#endif // !__NEUREKA_SIRACUSA_BSP_H__

neureka/hal/neureka_task.c

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -47,8 +47,7 @@ void neureka_task_init(neureka_task_t *task) {
4747

4848
void neureka_task_set_op_to_conv(neureka_task_t *task,
4949
const uint8_t kernel_shape,
50-
const uint8_t depthwise,
51-
const uint8_t stride) {
50+
const uint8_t depthwise) {
5251
task->depthwise = depthwise;
5352
task->kernel_shape = kernel_shape;
5453
task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3
@@ -133,16 +132,18 @@ uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
133132
return ptr - (padding_top * width + padding_left) * width_stride;
134133
}
135134

136-
void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
137-
uint32_t w_in, uint32_t w_in_stride,
138-
uint8_t padding_top, uint8_t padding_left,
139-
uint32_t output_ptr, uint32_t weights_ptr,
140-
uint32_t scale_ptr, uint32_t shift_ptr,
141-
uint32_t bias_ptr) {
135+
void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr,
136+
uint32_t w_in, uint32_t w_in_stride,
137+
uint8_t padding_top, uint8_t padding_left,
138+
uint32_t output_ptr, uint32_t weights_ptr) {
142139
task->data.infeat_ptr =
143140
neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
144141
task->data.outfeat_ptr = output_ptr;
145142
task->data.weights_ptr = weights_ptr;
143+
}
144+
145+
void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr,
146+
uint32_t shift_ptr, uint32_t bias_ptr) {
146147
task->data.scale_ptr = scale_ptr;
147148
task->data.scale_shift_ptr = shift_ptr;
148149
task->data.scale_bias_ptr = bias_ptr;
@@ -223,8 +224,8 @@ void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
223224
}
224225

225226
void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
226-
const uint8_t right, const uint8_t bottom,
227-
const uint8_t left) {
227+
const uint8_t bottom, const uint8_t left,
228+
const uint8_t right) {
228229
task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
229230
((bottom & 0xff) << 8) | ((left & 0xff) << 0);
230231
}
@@ -235,7 +236,7 @@ void neureka_task_set_dims(
235236
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
236237
const uint32_t h_out_stride, const uint32_t w_out_stride,
237238
const uint8_t padding_top, const uint8_t padding_bottom,
238-
const uint8_t padding_right, const uint8_t padding_left) {
239+
const uint8_t padding_left, const uint8_t padding_right) {
239240
neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
240241
w_out_stride);
241242
neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,

neureka/hal/neureka_task.h

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -51,8 +51,8 @@ typedef enum {
5151

5252
typedef struct neureka_norm_t {
5353
neureka_norm_mode_e mode;
54-
int flag_bias;
55-
int flag_shift;
54+
neureka_task_flag_e flag_bias;
55+
neureka_task_flag_e flag_shift;
5656
} neureka_norm_t;
5757

5858
typedef enum neureka_quant_mode_e {
@@ -67,9 +67,9 @@ typedef enum neureka_quant_function_e {
6767

6868
typedef struct neureka_quant_t {
6969
// Shift amount must be in range 0x00-0x1F
70-
unsigned shift_amount;
70+
uint8_t shift_amount;
7171
neureka_quant_function_e function;
72-
int flag_rounding;
72+
neureka_task_flag_e flag_rounding;
7373
} neureka_quant_t;
7474

7575
typedef struct neureka_stride_t {
@@ -128,7 +128,7 @@ typedef struct neureka_task_t {
128128
void neureka_task_init(neureka_task_t *task);
129129
void neureka_task_set_op_to_conv(neureka_task_t *task,
130130
const uint8_t kernel_shape,
131-
const uint8_t depthwise, const uint8_t stride);
131+
const uint8_t depthwise);
132132
void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits,
133133
const uint8_t output_bits,
134134
const uint8_t weight_bits);
@@ -147,12 +147,12 @@ uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
147147
uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
148148
const uint32_t width_stride, const uint8_t padding_top,
149149
const uint8_t padding_left);
150-
void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
151-
uint32_t w_in, uint32_t w_in_stride,
152-
uint8_t padding_top, uint8_t padding_left,
153-
uint32_t output_ptr, uint32_t weights_ptr,
154-
uint32_t scale_ptr, uint32_t shift_ptr,
155-
uint32_t bias_ptr);
150+
void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr,
151+
uint32_t w_in, uint32_t w_in_stride,
152+
uint8_t padding_top, uint8_t padding_left,
153+
uint32_t output_ptr, uint32_t weights_ptr);
154+
void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr,
155+
uint32_t shift_ptr, uint32_t bias_ptr);
156156
/** neureka_task_set_strides
157157
*
158158
* All the strides variables are strides between elements alongside that
@@ -173,8 +173,8 @@ void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
173173
const uint8_t bottom, const uint8_t left,
174174
const uint8_t right, const uint8_t value);
175175
void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
176-
const uint8_t right, const uint8_t bottom,
177-
const uint8_t left);
176+
const uint8_t bottom, const uint8_t left,
177+
const uint8_t right);
178178
/** neureka_task_set_dims
179179
*
180180
* All the strides variables are strides between elements alongside that
@@ -187,6 +187,6 @@ void neureka_task_set_dims(
187187
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
188188
const uint32_t h_out_stride, const uint32_t w_out_stride,
189189
const uint8_t padding_top, const uint8_t padding_bottom,
190-
const uint8_t padding_right, const uint8_t padding_left);
190+
const uint8_t padding_left, const uint8_t padding_right);
191191

192192
#endif // !__NEUREKA_TASK_H__

test/NeuralEngineFunctionalModel.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,24 +28,34 @@ def _norm_quant(
2828
bias_type: Optional[IntegerType],
2929
has_bias: bool,
3030
has_relu: bool,
31+
verbose: bool,
3132
) -> torch.Tensor:
3233
# Scale accumulators are 48-bit, so the data is kept in 64-bit
3334
tensor = tensor * scale
3435
assert tensor.dtype == torch.int64
3536

37+
if verbose:
38+
print("INTERMEDIATE RESULTS (after scale):")
39+
print(tensor)
40+
3641
if has_bias:
3742
assert bias is not None
3843
assert bias_type is not None
39-
# Saturating cast to int32
44+
4045
tensor = NeuralEngineFunctionalModel._cast(
41-
tensor, bias_type, saturate=True
46+
tensor, bias_type, saturate=False
4247
).type(torch.int32)
4348

4449
tensor = tensor + bias
50+
4551
tensor = NeuralEngineFunctionalModel._cast(
46-
tensor, bias_type, saturate=False
52+
tensor, bias_type, saturate=True
4753
).type(torch.int32)
4854

55+
if verbose:
56+
print("INTERMEDIATE RESULTS (after bias):")
57+
print(tensor)
58+
4959
if has_relu:
5060
tensor = F.relu(tensor)
5161

@@ -118,6 +128,7 @@ def convolution(
118128
bias_type,
119129
has_bias,
120130
has_relu,
131+
verbose,
121132
)
122133

123134
return output

test/NeurekaMemoryLayout.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,6 @@
2020
import numpy as np
2121
import numpy.typing as npt
2222

23-
from TestClasses import IntegerType
24-
2523

2624
class NeurekaMemoryLayout:
2725
_WEIGHT_BANDWIDTH = 256

0 commit comments

Comments
 (0)