Skip to content

Commit 4b6c6ee

Browse files
authored
Merge pull request #118 from foss-for-synopsys-dwc-arc-processors/slicing
Add sub tensor functionality to support slicing
2 parents 8bf0ae2 + 2b1eede commit 4b6c6ee

File tree

7 files changed

+148
-13
lines changed

7 files changed

+148
-13
lines changed

include/api/mli_helpers_api.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,24 @@ mli_status mli_hlp_convert_tensor(mli_tensor *in, mli_tensor *out);
111111
*/
112112
mli_status mli_hlp_point_to_subtensor(const mli_tensor *in, const mli_point_to_subtsr_cfg *cfg, mli_tensor *out);
113113

114+
/**
115+
* @brief Create a Sub-Tensor from a larger tensor
116+
*
117+
 * @detail This function points to a sub tensor within the input tensor. It performs operations
118+
 * on pointers and does not copy data (it only points to a subsequence of the data in the input).
119+
* For this reason, depending on the parameters, it can happen that the sub tensor contains
120+
* data that is not adjacent in memory.
121+
*
122+
* For more info on primitive see MLI Documentation
123+
*
124+
* @param in [I] Input tensor (of any shape)
125+
* @param cfg [I] Configuration structure (for more info see @ref mli_sub_tensor_cfg)
126+
* @param out [O] Output tensor. Result will be stored here
127+
*
128+
* @return MLI status code
129+
*/
130+
mli_status mli_hlp_create_subtensor(const mli_tensor *in, const mli_sub_tensor_cfg *cfg, mli_tensor *out);
131+
114132
uint32_t mli_hlp_tensor_scale_shift(const mli_tensor *in);
115133

116134
int16_t mli_hlp_tensor_scale(const mli_tensor *in, const uint32_t scale_idx);

include/mli_types.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,20 @@ typedef struct {
306306
uint8_t first_out_dim_size; /**< First output dimension size */
307307
} mli_point_to_subtsr_cfg;
308308

309+
/**
310+
* @brief Create Subtensor helper config
311+
*
312+
* Data structure to provide coordinates and sizes of required subtensor in the input tensor
313+
* The size can be reduced in any dimension.
314+
*/
315+
typedef struct {
316+
uint32_t offset[MLI_MAX_RANK]; /**< subtensor start coordinates in the input tensor
317+
The size of this array is determined by the rank of the input tensor*/
318+
uint32_t size[MLI_MAX_RANK]; /**< Size of the sub tensor in elements per dimension
319+
the number of entries in this array is determind by the input tensor */
320+
uint32_t sub_tensor_rank; /**< Rank of the sub tensor that will be produced */
321+
} mli_sub_tensor_cfg;
322+
309323
/**
310324
* @brief Data layout type for vision kernels (convolutions/pooloing mostly).
311325
*

lib/src/helpers/src/mli_helpers.c

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,68 @@ mli_status mli_hlp_point_to_subtensor(const mli_tensor *in, const mli_point_to_s
157157
return MLI_STATUS_OK;
158158
}
159159

160+
mli_status mli_hlp_create_subtensor(const mli_tensor *in, const mli_sub_tensor_cfg *cfg, mli_tensor *out) {
161+
mli_status ret = MLI_CHECK_STATUS(mli_chk_create_subtensor(in, cfg, out), __func__);
162+
if (ret != MLI_STATUS_OK)
163+
return ret;
164+
165+
const uint32_t elem_size = mli_hlp_tensor_element_size(in);
166+
const uint32_t out_rank = cfg->sub_tensor_rank;
167+
uint32_t mem_strides[MLI_MAX_RANK];
168+
const uint32_t input_rank = in->rank;
169+
const bool isAsym = (in->el_type == MLI_EL_ASYM_I8) || (in->el_type == MLI_EL_ASYM_I32);
170+
171+
// compute memory strides for the input tensor if not yet provided by the input tensor.
172+
mem_strides[input_rank - 1] = in->mem_stride[input_rank - 1] != 0 ? in->mem_stride[input_rank - 1] : 1;
173+
for (int i = input_rank - 2; i >= 0; i--) {
174+
mem_strides[i] = in->mem_stride[i] != 0 ? in->mem_stride[i] : mem_strides[i+1] * in->shape[i+1];
175+
}
176+
177+
// compute the offset inside the buffer
178+
int buf_offset = 0;
179+
for (int i = 0; i < input_rank; i++) {
180+
buf_offset += cfg->offset[i] * mem_strides[i];
181+
}
182+
buf_offset *= elem_size;
183+
out->data = (void *)((char *)in->data + buf_offset);
184+
out->capacity = in->capacity - buf_offset;
185+
186+
// Fill the shape[] of the output tensor.
187+
// If the sub_tensor_rank is smaller than the input rank, the dimensions with
188+
// a size of 1 will be removed in the output shape starting from the first dimension
189+
// until the requested sub_tensor_rank value is reached.
190+
int out_idx = 0;
191+
int skip_cnt = input_rank - out_rank;
192+
int out_asym_dim = -1;
193+
int out_asym_offset = 0;
194+
for (int in_idx = 0; in_idx < input_rank; in_idx++) {
195+
if ((skip_cnt > 0) && (cfg->size[in_idx] == 1)) {
196+
skip_cnt--;
197+
continue;
198+
}
199+
out->shape[out_idx] = cfg->size[in_idx];
200+
out->mem_stride[out_idx] = mem_strides[in_idx];
201+
if (isAsym && (in->el_params.asym.dim == in_idx)) {
202+
out_asym_dim = out_idx;
203+
out_asym_offset = cfg->offset[in_idx];
204+
}
205+
out_idx++;
206+
}
207+
208+
out->rank = out_rank;
209+
out->el_params = in->el_params;
210+
out->el_type = in->el_type;
211+
212+
if (isAsym){
213+
if (out->el_params.asym.dim >= 0) {
214+
out->el_params.asym.scale.pi16 += out_asym_offset;
215+
out->el_params.asym.dim = out_asym_dim;
216+
out->el_params.asym.zero_point.pi16 += out_asym_offset;
217+
}
218+
}
219+
return MLI_STATUS_OK;
220+
}
221+
160222

161223
mli_status mli_hlp_convert_tensor(mli_tensor *in, mli_tensor *out) {
162224
mli_status ret = MLI_CHECK_STATUS(mli_chk_convert_tensor(in, out), __func__);

lib/src/kernels/common/mli_krn_fully_connected.h

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ static inline void ip_op(
130130
const int out_shift,
131131
const int16_t input_offset,
132132
const int16_t output_offset) {
133+
const int left_shift = out_shift > 0 ? 0 : -out_shift;
134+
const int right_shift = out_shift > 0 ? out_shift : 0;
133135
// Matrix-Vector multiplication
134136
//==============================
135137
if (_Rarely(in_elements < 8)) {
@@ -143,14 +145,16 @@ static inline void ip_op(
143145
weights++;
144146
}
145147
in -= in_elements;
148+
149+
accu = mli_math_acc_ashift_fx(accu, -left_shift);
146150
accu = mli_math_scale_mul<acc_T, true>(accu, out_mul);
147151

148152
// adding the output offset needs to happen after the output mul and output shift
149153
// but before the cast to the output container size.
150154
// because the cast and shift are combined in one function, the output offset is
151-
// added before, and multiplied with 1<< out_shift to compensate.
152-
accu = mli_math_mac_fx(accu, (int16_t)(1<<out_shift), (io_T)output_offset);
153-
out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, out_shift);
155+
// added before, and multiplied with 1<< right_shift to compensate.
156+
accu = mli_math_mac_fx(accu, (int16_t)(1<<right_shift), (io_T)output_offset);
157+
out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, right_shift);
154158
}
155159
} else {
156160
if ((in_elements & 0x3) == 0) {
@@ -165,15 +169,16 @@ LOOP_PIPELINE_ENABLE
165169
weights += 4;
166170
}
167171
in -= in_elements;
168-
172+
173+
accu = mli_math_acc_ashift_fx(accu, -left_shift);
169174
accu = mli_math_scale_mul<acc_T, true>(accu, out_mul);
170175

171176
// adding the output offset needs to happen after the output mul and output shift
172177
// but before the cast to the output container size.
173178
// because the cast and shift are combined in one function, the output offset is
174-
// added before, and multiplied with 1<< out_shift to compensate.
175-
accu = mli_math_mac_fx(accu, (int16_t)(1<<out_shift), (io_T)output_offset);
176-
out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, out_shift);
179+
// added before, and multiplied with 1<< right_shift to compensate.
180+
accu = mli_math_mac_fx(accu, (int16_t)(1<<right_shift), (io_T)output_offset);
181+
out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, right_shift);
177182
}
178183
} else {
179184
for (int o_idx = 0; o_idx < out_elements; o_idx++) {
@@ -197,14 +202,16 @@ LOOP_PIPELINE_ENABLE
197202
weights += 4;
198203
}
199204
in -= in_elements;
205+
206+
accu = mli_math_acc_ashift_fx(accu, -left_shift);
200207
accu = mli_math_scale_mul<acc_T, true>(accu, out_mul);
201208

202209
// adding the output offset needs to happen after the output mul and output shift
203210
// but before the cast to the output container size.
204211
// because the cast and shift are combined in one function, the output offset is
205-
// added before, and multiplied with 1<< out_shift to compensate.
206-
accu = mli_math_mac_fx(accu, (int16_t)(1<<out_shift), (io_T)output_offset);
207-
out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, out_shift);
212+
// added before, and multiplied with 1<< right_shift to compensate.
213+
accu = mli_math_mac_fx(accu, (int16_t)(1<<right_shift), (io_T)output_offset);
214+
out[o_idx] = mli_math_acc_cast_fx<io_T, acc_T> (accu, right_shift);
208215
}
209216
}
210217
}
@@ -224,7 +231,7 @@ static void fully_connected_prepare_and_run(
224231
MLI_CONV_OUT_PTR(io_T) out_ptr = (MLI_CONV_OUT_PTR(io_T)) (out->data);
225232

226233
int ch_out = bias->shape[0];
227-
int in_sz = mli_prv_count_elem_num(in);
234+
int in_sz = weights->shape[1];
228235

229236
// Define shift values
230237
int bias_shift = mli_prv_calc_shift(in, weights, bias);

lib/src/move/mli_mov_api.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,20 @@ mli_status mli_mov_prepare(mli_mov_handle_t* h, const mli_tensor* src, const mli
133133
dst->mem_stride[i] = 0;
134134
}
135135

136-
dst->mem_stride[rank - 1] = cfg->dst_mem_stride[rank - 1] != 0 ? cfg->dst_mem_stride[rank - 1] : 1;
136+
/* if destination memstride is provided in the configuration, use it.
137+
if not, check if the output tensor provides a mem stride.
138+
when no memstride is provided at all, compute the memstride based on the destination shape */
139+
if (cfg->dst_mem_stride[rank - 1] != 0) {
140+
dst->mem_stride[rank - 1] = cfg->dst_mem_stride[rank - 1];
141+
} else if (dst->mem_stride[rank - 1] == 0) {
142+
dst->mem_stride[rank - 1] = 1;
143+
}
137144
for (int i = rank - 2; i >= 0; i--) {
138-
dst->mem_stride[i] = cfg->dst_mem_stride[i] != 0 ? cfg->dst_mem_stride[i] : dst->mem_stride[i+1] * dst->shape[i + 1];
145+
if (cfg->dst_mem_stride[i] != 0) {
146+
dst->mem_stride[i] = cfg->dst_mem_stride[i];
147+
} else if (dst->mem_stride[i] == 0) {
148+
dst->mem_stride[i] = dst->mem_stride[i + 1] * dst->shape[i + 1];
149+
}
139150
}
140151

141152
// update state in the handle

lib/src/private/mli_check.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ mli_status mli_chk_permute_fx16(const mli_tensor * in, const mli_permute_cfg * c
325325
mli_status mli_chk_count_elem_num(const mli_tensor *in, uint32_t start_dim);
326326
mli_status mli_chk_convert_tensor(mli_tensor *in, mli_tensor *out);
327327
mli_status mli_chk_point_to_subtensor(const mli_tensor *in, const mli_point_to_subtsr_cfg *cfg, mli_tensor *out);
328+
mli_status mli_chk_create_subtensor(const mli_tensor *in, const mli_sub_tensor_cfg *cfg, mli_tensor *out);
328329

329330
#ifdef __cplusplus
330331
}

lib/src/private/src/mli_check.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1744,4 +1744,26 @@ mli_status mli_chk_point_to_subtensor(const mli_tensor *in, const mli_point_to_s
17441744
return MLI_STATUS_OK;
17451745
}
17461746

1747+
mli_status mli_chk_create_subtensor(const mli_tensor *in, const mli_sub_tensor_cfg *cfg, mli_tensor *out) {
1748+
mli_status stat = MLI_STATUS_OK;
1749+
1750+
// Check that in tensor is valid and out provides valid pointers
1751+
stat = MLI_CHECK_STATUS(mli_chk_tensor (in), "Bad input tensor");
1752+
if (stat != MLI_STATUS_OK) return stat;
1753+
if (MLI_CHECK(out != NULL , "Bad Output tensor pointer")) return MLI_STATUS_BAD_TENSOR;
1754+
1755+
if (MLI_CHECK(cfg != NULL , "Bad cfg pointer")) return MLI_STATUS_BAD_FUNC_CFG;
1756+
if (MLI_CHECK(cfg->sub_tensor_rank <= in->rank, "incorrect number of coordinates"))
1757+
return MLI_STATUS_BAD_FUNC_CFG;
1758+
1759+
for (int i = 0; i < in->rank; i++) {
1760+
if (MLI_CHECK(cfg->offset[i] < in->shape[i], "bad config"))
1761+
return MLI_STATUS_BAD_FUNC_CFG;
1762+
if (MLI_CHECK(cfg->offset[i] + cfg->size[i] <= in->shape[i], "bad config"))
1763+
return MLI_STATUS_BAD_FUNC_CFG;
1764+
}
1765+
1766+
return MLI_STATUS_OK;
1767+
}
1768+
17471769
#pragma code()

0 commit comments

Comments
 (0)