Skip to content

Commit af60572

Browse files
Hakim7267 authored and JaccovG committed
unroll eltwise max/min
1 parent 68e502e commit af60572

File tree

2 files changed

+159
-4
lines changed

2 files changed

+159
-4
lines changed

lib/src/kernels/eltwise/impl/mli_krn_eltwise_vdsp.h

Lines changed: 115 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,15 +22,15 @@ const int unroll_factor[2][5] = {
2222
/* ELTWISE_ADD_NO_CONVERT = */ 1,
2323
/* ELTWISE_SUB_NO_CONVERT = */ 1,
2424
/* ELTWISE_MUL_NO_CONVERT = */ 4,
25-
/* ELTWISE_MAX_NO_CONVERT = */ 1,
26-
/* ELTWISE_MIN_NO_CONVERT = */ 1
25+
/* ELTWISE_MAX_NO_CONVERT = */ 4,
26+
/* ELTWISE_MIN_NO_CONVERT = */ 4
2727
} ,
2828
{
2929
/* ELTWISE_ADD_CONVERT = */ 1,
3030
/* ELTWISE_SUB_CONVERT = */ 1,
3131
/* ELTWISE_MUL_CONVERT = */ 3,
32-
/* ELTWISE_MAX_CONVERT = */ 1,
33-
/* ELTWISE_MIN_CONVERT = */ 1
32+
/* ELTWISE_MAX_CONVERT = */ 3,
33+
/* ELTWISE_MIN_CONVERT = */ 3
3434
}
3535
};
3636

@@ -549,6 +549,117 @@ void eltwise_innerloop(
549549
idx_out += num_lanes;
550550
}
551551
}
552+
/*
 * Explicit specialization of eltwise_innerloop for int16_t data, the
 * ELTWISE_MAX function and a `false` third template argument (stored below
 * as convert_int).
 *
 * Processes `count` elements: an optional leading partial vector of
 * `remaining_part` elements, followed by full vectors of `num_lanes` elements.
 *
 * op1_ptr, op2_ptr      - base pointers of the two input operands.
 * out_ptr               - base pointer of the output.
 * idx1, idx2, idx_out   - starting element offsets into op1_ptr, op2_ptr and
 *                         out_ptr; advanced locally as elements are consumed.
 * count                 - number of elements to process.
 * op1_s, op2_s          - scalar values used in place of a vector load when
 *                         scalar_op1 / scalar_op2 is true.
 * scalar_op1, scalar_op2 - select the scalar value instead of loading from
 *                         the corresponding operand pointer.
 * in_offset1, in_offset2, out_offset, scale1, scale2, pre_op_shift1,
 * pre_op_shift2, post_op_shift
 *                       - forwarded unchanged to eltwise_perform_operation.
 */
template<>
553+
MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MAX, false>(
554+
const MLI_PTR(int16_t) __restrict op1_ptr,
555+
const MLI_PTR(int16_t) __restrict op2_ptr,
556+
MLI_PTR(int16_t) __restrict out_ptr,
557+
int idx1,
558+
int idx2,
559+
int idx_out,
560+
const int count,
561+
int16_t op1_s,
562+
int16_t op2_s,
563+
const bool scalar_op1,
564+
const bool scalar_op2,
565+
const int16_t in_offset1,
566+
const int16_t in_offset2,
567+
const int16_t out_offset,
568+
const int16_t scale1,
569+
const int16_t scale2,
570+
const int pre_op_shift1,
571+
const int pre_op_shift2,
572+
const int post_op_shift) {
573+
/* Dummy Load to get num_lanes, remaining part */
/* The loaded value itself is not used in the computation: `input` only
 * supplies the vector type (via decltype) and the lane count. The load is
 * issued from op1_ptr even when scalar_op1 is true. */
574+
auto input = mli_prv_load_1vec(op1_ptr);
575+
int num_lanes = get_number_lanes(input);
576+
/* NOTE(review): the mask equals count % num_lanes only if num_lanes is a
 * power of two - presumably always the case for these vector types; confirm. */
int remaining_part = count & (num_lanes - 1);
577+
/* Vector-typed copies of the scalar operands, used when scalar_op1/2 is set. */
decltype(input) op1_scalar = op1_s;
578+
decltype(input) op2_scalar = op2_s;
579+
/* Indices into the unroll_factor table used by the loop pragma below. */
const int convert_int = static_cast<int>(false);
580+
const int func_int = static_cast<int>(ELTWISE_MAX);
581+
582+
/* Partial vector first: the operation runs on a whole vector but only
 * remaining_part samples are stored, and the indices advance by that amount.
 * NOTE(review): the loads here appear to fetch a full vector - confirm that
 * reading beyond remaining_part elements is acceptable for the callers. */
if (remaining_part) {
583+
auto val1 = (scalar_op1) ? op1_scalar : mli_prv_load_1vec(op1_ptr + idx1);
584+
auto val2 = (scalar_op2) ? op2_scalar : mli_prv_load_1vec(op2_ptr + idx2);
585+
auto res = mli::krn::eltwise_perform_operation<decltype(input), decltype(input), ELTWISE_MAX, false>(
586+
val1, val2, in_offset1, in_offset2, out_offset, scale1,
587+
scale2, pre_op_shift1, pre_op_shift2, post_op_shift);
588+
mli_prv_store_n_samples(&out_ptr[idx_out], res, remaining_part);
589+
idx1 += remaining_part;
590+
idx2 += remaining_part;
591+
idx_out += remaining_part;
592+
}
593+
594+
/* Main loop over the remaining (count - remaining_part) elements, one vector
 * of num_lanes elements per iteration; the requested unroll count is read
 * from unroll_factor[convert_int][func_int]. */
#pragma clang loop unroll_count(unroll_factor[convert_int][func_int])
595+
for (int pos = 0; pos < (count - remaining_part); pos+=num_lanes) {
596+
auto val1 = (scalar_op1) ? op1_scalar : mli_prv_load_1vec(op1_ptr + idx1);
597+
auto val2 = (scalar_op2) ? op2_scalar : mli_prv_load_1vec(op2_ptr + idx2);
598+
auto res = mli::krn::eltwise_perform_operation<decltype(input), decltype(input), ELTWISE_MAX, false>(
599+
val1, val2, in_offset1, in_offset2, out_offset, scale1,
600+
scale2, pre_op_shift1, pre_op_shift2, post_op_shift);
601+
mli_prv_store_n_samples(&out_ptr[idx_out], res);
602+
idx1 += num_lanes;
603+
idx2 += num_lanes;
604+
idx_out += num_lanes;
605+
}
606+
}
607+
608+
/*
 * Explicit specialization of eltwise_innerloop for int16_t data, the
 * ELTWISE_MIN function and a `false` third template argument (stored below
 * as convert_int).
 *
 * Processes `count` elements: an optional leading partial vector of
 * `remaining_part` elements, followed by full vectors of `num_lanes` elements.
 *
 * op1_ptr, op2_ptr      - base pointers of the two input operands.
 * out_ptr               - base pointer of the output.
 * idx1, idx2, idx_out   - starting element offsets into op1_ptr, op2_ptr and
 *                         out_ptr; advanced locally as elements are consumed.
 * count                 - number of elements to process.
 * op1_s, op2_s          - scalar values used in place of a vector load when
 *                         scalar_op1 / scalar_op2 is true.
 * scalar_op1, scalar_op2 - select the scalar value instead of loading from
 *                         the corresponding operand pointer.
 * in_offset1, in_offset2, out_offset, scale1, scale2, pre_op_shift1,
 * pre_op_shift2, post_op_shift
 *                       - forwarded unchanged to eltwise_perform_operation.
 */
template<>
609+
MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MIN, false>(
610+
const MLI_PTR(int16_t) __restrict op1_ptr,
611+
const MLI_PTR(int16_t) __restrict op2_ptr,
612+
MLI_PTR(int16_t) __restrict out_ptr,
613+
int idx1,
614+
int idx2,
615+
int idx_out,
616+
const int count,
617+
int16_t op1_s,
618+
int16_t op2_s,
619+
const bool scalar_op1,
620+
const bool scalar_op2,
621+
const int16_t in_offset1,
622+
const int16_t in_offset2,
623+
const int16_t out_offset,
624+
const int16_t scale1,
625+
const int16_t scale2,
626+
const int pre_op_shift1,
627+
const int pre_op_shift2,
628+
const int post_op_shift) {
629+
/* Dummy Load to get num_lanes, remaining part */
/* The loaded value itself is not used in the computation: `input` only
 * supplies the vector type (via decltype) and the lane count. The load is
 * issued from op1_ptr even when scalar_op1 is true. */
630+
auto input = mli_prv_load_1vec(op1_ptr);
631+
int num_lanes = get_number_lanes(input);
632+
/* NOTE(review): the mask equals count % num_lanes only if num_lanes is a
 * power of two - presumably always the case for these vector types; confirm. */
int remaining_part = count & (num_lanes - 1);
633+
/* Vector-typed copies of the scalar operands, used when scalar_op1/2 is set. */
decltype(input) op1_scalar = op1_s;
634+
decltype(input) op2_scalar = op2_s;
635+
/* Indices into the unroll_factor table used by the loop pragma below. */
const int convert_int = static_cast<int>(false);
636+
const int func_int = static_cast<int>(ELTWISE_MIN);
637+
638+
/* Partial vector first: the operation runs on a whole vector but only
 * remaining_part samples are stored, and the indices advance by that amount.
 * NOTE(review): the loads here appear to fetch a full vector - confirm that
 * reading beyond remaining_part elements is acceptable for the callers. */
if (remaining_part) {
639+
auto val1 = (scalar_op1) ? op1_scalar : mli_prv_load_1vec(op1_ptr + idx1);
640+
auto val2 = (scalar_op2) ? op2_scalar : mli_prv_load_1vec(op2_ptr + idx2);
641+
auto res = mli::krn::eltwise_perform_operation<decltype(input), decltype(input), ELTWISE_MIN, false>(
642+
val1, val2, in_offset1, in_offset2, out_offset, scale1,
643+
scale2, pre_op_shift1, pre_op_shift2, post_op_shift);
644+
mli_prv_store_n_samples(&out_ptr[idx_out], res, remaining_part);
645+
idx1 += remaining_part;
646+
idx2 += remaining_part;
647+
idx_out += remaining_part;
648+
}
649+
650+
/* Main loop over the remaining (count - remaining_part) elements, one vector
 * of num_lanes elements per iteration; the requested unroll count is read
 * from unroll_factor[convert_int][func_int]. */
#pragma clang loop unroll_count(unroll_factor[convert_int][func_int])
651+
for (int pos = 0; pos < (count - remaining_part); pos+=num_lanes) {
652+
auto val1 = (scalar_op1) ? op1_scalar : mli_prv_load_1vec(op1_ptr + idx1);
653+
auto val2 = (scalar_op2) ? op2_scalar : mli_prv_load_1vec(op2_ptr + idx2);
654+
auto res = mli::krn::eltwise_perform_operation<decltype(input), decltype(input), ELTWISE_MIN, false>(
655+
val1, val2, in_offset1, in_offset2, out_offset, scale1,
656+
scale2, pre_op_shift1, pre_op_shift2, post_op_shift);
657+
mli_prv_store_n_samples(&out_ptr[idx_out], res);
658+
idx1 += num_lanes;
659+
idx2 += num_lanes;
660+
idx_out += num_lanes;
661+
}
662+
}
552663

553664
} // namespace vdsp
554665
} // namespace krn

lib/src/kernels/eltwise/mli_krn_eltwise_decl.h

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,50 @@ void eltwise_innerloop(
368368
const int pre_op_shift1,
369369
const int pre_op_shift2,
370370
const int post_op_shift);
371+
372+
/*
 * Declaration of the explicit specialization
 * eltwise_innerloop<int16_t, ELTWISE_MAX, false>. No body is provided here;
 * the parameter list mirrors the primary eltwise_innerloop template declared
 * just above, with the element type fixed to int16_t.
 */
template<>
373+
MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MAX, false>(
374+
const MLI_PTR(int16_t) __restrict op1_ptr,
375+
const MLI_PTR(int16_t) __restrict op2_ptr,
376+
MLI_PTR(int16_t) __restrict out_ptr,
377+
int idx1,
378+
int idx2,
379+
int idx_out,
380+
const int count,
381+
int16_t op1_s,
382+
int16_t op2_s,
383+
const bool scalar_op1,
384+
const bool scalar_op2,
385+
const int16_t in_offset1,
386+
const int16_t in_offset2,
387+
const int16_t out_offset,
388+
const int16_t scale1,
389+
const int16_t scale2,
390+
const int pre_op_shift1,
391+
const int pre_op_shift2,
392+
const int post_op_shift);
393+
394+
/*
 * Declaration of the explicit specialization
 * eltwise_innerloop<int16_t, ELTWISE_MIN, false>. No body is provided here;
 * the parameter list is the same as for the ELTWISE_MAX specialization, with
 * the element type fixed to int16_t.
 */
template<>
395+
MLI_FORCE_INLINE void eltwise_innerloop<int16_t, ELTWISE_MIN, false>(
396+
const MLI_PTR(int16_t) __restrict op1_ptr,
397+
const MLI_PTR(int16_t) __restrict op2_ptr,
398+
MLI_PTR(int16_t) __restrict out_ptr,
399+
int idx1,
400+
int idx2,
401+
int idx_out,
402+
const int count,
403+
int16_t op1_s,
404+
int16_t op2_s,
405+
const bool scalar_op1,
406+
const bool scalar_op2,
407+
const int16_t in_offset1,
408+
const int16_t in_offset2,
409+
const int16_t out_offset,
410+
const int16_t scale1,
411+
const int16_t scale2,
412+
const int pre_op_shift1,
413+
const int pre_op_shift2,
414+
const int post_op_shift);
371415
} // namespace vdsp
372416
} // namespace krn
373417
} // namespace mli

0 commit comments

Comments
 (0)