@@ -22,15 +22,15 @@ const int unroll_factor[2][5] = {
2222 /* ELTWISE_ADD_NO_CONVERT = */ 1 ,
2323 /* ELTWISE_SUB_NO_CONVERT = */ 1 ,
2424 /* ELTWISE_MUL_NO_CONVERT = */ 4 ,
25- /* ELTWISE_MAX_NO_CONVERT = */ 1 ,
26- /* ELTWISE_MIN_NO_CONVERT = */ 1
25+ /* ELTWISE_MAX_NO_CONVERT = */ 4 ,
26+ /* ELTWISE_MIN_NO_CONVERT = */ 4
2727 } ,
2828 {
2929 /* ELTWISE_ADD_CONVERT = */ 1 ,
3030 /* ELTWISE_SUB_CONVERT = */ 1 ,
3131 /* ELTWISE_MUL_CONVERT = */ 3 ,
32- /* ELTWISE_MAX_CONVERT = */ 1 ,
33- /* ELTWISE_MIN_CONVERT = */ 1
32+ /* ELTWISE_MAX_CONVERT = */ 3 ,
33+ /* ELTWISE_MIN_CONVERT = */ 3
3434 }
3535};
3636
@@ -549,6 +549,117 @@ void eltwise_innerloop(
549549 idx_out += num_lanes;
550550 }
551551}
552+ template <>
553+ MLI_FORCE_INLINE void eltwise_innerloop<int16_t , ELTWISE_MAX, false >(
554+ const MLI_PTR (int16_t ) __restrict op1_ptr,
555+ const MLI_PTR(int16_t ) __restrict op2_ptr,
556+ MLI_PTR(int16_t ) __restrict out_ptr,
557+ int idx1,
558+ int idx2,
559+ int idx_out,
560+ const int count,
561+ int16_t op1_s,
562+ int16_t op2_s,
563+ const bool scalar_op1,
564+ const bool scalar_op2,
565+ const int16_t in_offset1,
566+ const int16_t in_offset2,
567+ const int16_t out_offset,
568+ const int16_t scale1,
569+ const int16_t scale2,
570+ const int pre_op_shift1,
571+ const int pre_op_shift2,
572+ const int post_op_shift) {
573+ /* Dummy Load to get num_lanes, remaining part */
574+ auto input = mli_prv_load_1vec (op1_ptr);
575+ int num_lanes = get_number_lanes (input);
576+ int remaining_part = count & (num_lanes - 1 );
577+ decltype (input) op1_scalar = op1_s;
578+ decltype (input) op2_scalar = op2_s;
579+ const int convert_int = static_cast <int >(false );
580+ const int func_int = static_cast <int >(ELTWISE_MAX);
581+
582+ if (remaining_part) {
583+ auto val1 = (scalar_op1) ? op1_scalar : mli_prv_load_1vec (op1_ptr + idx1);
584+ auto val2 = (scalar_op2) ? op2_scalar : mli_prv_load_1vec (op2_ptr + idx2);
585+ auto res = mli::krn::eltwise_perform_operation<decltype (input), decltype (input), ELTWISE_MAX, false >(
586+ val1, val2, in_offset1, in_offset2, out_offset, scale1,
587+ scale2, pre_op_shift1, pre_op_shift2, post_op_shift);
588+ mli_prv_store_n_samples (&out_ptr[idx_out], res, remaining_part);
589+ idx1 += remaining_part;
590+ idx2 += remaining_part;
591+ idx_out += remaining_part;
592+ }
593+
594+ #pragma clang loop unroll_count(unroll_factor[convert_int][func_int])
595+ for (int pos = 0 ; pos < (count - remaining_part); pos+=num_lanes) {
596+ auto val1 = (scalar_op1) ? op1_scalar : mli_prv_load_1vec (op1_ptr + idx1);
597+ auto val2 = (scalar_op2) ? op2_scalar : mli_prv_load_1vec (op2_ptr + idx2);
598+ auto res = mli::krn::eltwise_perform_operation<decltype (input), decltype (input), ELTWISE_MAX, false >(
599+ val1, val2, in_offset1, in_offset2, out_offset, scale1,
600+ scale2, pre_op_shift1, pre_op_shift2, post_op_shift);
601+ mli_prv_store_n_samples (&out_ptr[idx_out], res);
602+ idx1 += num_lanes;
603+ idx2 += num_lanes;
604+ idx_out += num_lanes;
605+ }
606+ }
607+
608+ template <>
609+ MLI_FORCE_INLINE void eltwise_innerloop<int16_t , ELTWISE_MIN, false >(
610+ const MLI_PTR (int16_t ) __restrict op1_ptr,
611+ const MLI_PTR(int16_t ) __restrict op2_ptr,
612+ MLI_PTR(int16_t ) __restrict out_ptr,
613+ int idx1,
614+ int idx2,
615+ int idx_out,
616+ const int count,
617+ int16_t op1_s,
618+ int16_t op2_s,
619+ const bool scalar_op1,
620+ const bool scalar_op2,
621+ const int16_t in_offset1,
622+ const int16_t in_offset2,
623+ const int16_t out_offset,
624+ const int16_t scale1,
625+ const int16_t scale2,
626+ const int pre_op_shift1,
627+ const int pre_op_shift2,
628+ const int post_op_shift) {
629+ /* Dummy Load to get num_lanes, remaining part */
630+ auto input = mli_prv_load_1vec (op1_ptr);
631+ int num_lanes = get_number_lanes (input);
632+ int remaining_part = count & (num_lanes - 1 );
633+ decltype (input) op1_scalar = op1_s;
634+ decltype (input) op2_scalar = op2_s;
635+ const int convert_int = static_cast <int >(false );
636+ const int func_int = static_cast <int >(ELTWISE_MIN);
637+
638+ if (remaining_part) {
639+ auto val1 = (scalar_op1) ? op1_scalar : mli_prv_load_1vec (op1_ptr + idx1);
640+ auto val2 = (scalar_op2) ? op2_scalar : mli_prv_load_1vec (op2_ptr + idx2);
641+ auto res = mli::krn::eltwise_perform_operation<decltype (input), decltype (input), ELTWISE_MIN, false >(
642+ val1, val2, in_offset1, in_offset2, out_offset, scale1,
643+ scale2, pre_op_shift1, pre_op_shift2, post_op_shift);
644+ mli_prv_store_n_samples (&out_ptr[idx_out], res, remaining_part);
645+ idx1 += remaining_part;
646+ idx2 += remaining_part;
647+ idx_out += remaining_part;
648+ }
649+
650+ #pragma clang loop unroll_count(unroll_factor[convert_int][func_int])
651+ for (int pos = 0 ; pos < (count - remaining_part); pos+=num_lanes) {
652+ auto val1 = (scalar_op1) ? op1_scalar : mli_prv_load_1vec (op1_ptr + idx1);
653+ auto val2 = (scalar_op2) ? op2_scalar : mli_prv_load_1vec (op2_ptr + idx2);
654+ auto res = mli::krn::eltwise_perform_operation<decltype (input), decltype (input), ELTWISE_MIN, false >(
655+ val1, val2, in_offset1, in_offset2, out_offset, scale1,
656+ scale2, pre_op_shift1, pre_op_shift2, post_op_shift);
657+ mli_prv_store_n_samples (&out_ptr[idx_out], res);
658+ idx1 += num_lanes;
659+ idx2 += num_lanes;
660+ idx_out += num_lanes;
661+ }
662+ }
552663
553664} // namespace vdsp
554665} // namespace krn
0 commit comments