@@ -642,22 +642,6 @@ template void requantize_block_32(const Requantize32 &qp, unsigned int width, un
642642/*
643643 * Routine (and helpers) to compute row sums needed for offset correction.
644644 *
645- * This is often needed for a lot of short rows (e.g. Syrax 5 - 6400 rows
646- * of length 27), therefore it's important not to sacrifice performance on
647- * odd length rows.
648- *
649- * To minimize performance loss in these cases, this routine will overread
650- * by up to 7 bytes.
651- *
652- * This is handled via "mask" and "mask mode" parameters to the inner
653- * routines; mask mode == 1 indicates that there are between 1 and 8 bytes
654- * (inclusive) needed at the end; in these cases we always read 8 bytes.
655- * mask mode == 2 indicates that there are between 9 and 15 bytes needed at
656- * the end, and in this case we always read 16 bytes. In both cases the
657- * 'mask' vector is set up so that the read value can be masked off to clear
658- * the overread lanes. This is handled by 'accumulate_masked_8' and
659- * 'accumulate_masked_16' above.
660- *
661645 * This routine is templated on the type to be accumulated, because the
662646 * innermost instruction used needs to be of the correct signedness.
663647 * However, beyond this point we always use signed values in both cases.
@@ -670,19 +654,21 @@ template void requantize_block_32(const Requantize32 &qp, unsigned int width, un
670654 * accumulators. The 4 accumulators for up to 4 rows being processed are
671655 * then added together into a single output vector using pairwise adds.
672656 *
657+ * For odd lengths (not multiples of 16), the odd bytes are copied into a
658+ * temporary 16-byte buffer before calling the standard routine to avoid
659+ * overreading the input.
660+ *
673661 * This reduction from the 8x16-bit into the 4x32-bit accumulators needs to
674662 * occur before the 16-bit accumulators can overflow - which is every 32
675663 * iterations (512 total bytes processed). This is explained more below.
676664 */
677665namespace {
678666 struct row_sum_helpers {
679- const Requantize32 &qp;
680-
681667 /* Load a full 16 byte vector, pairwise accumulate into 'sum' with uadalp or sadalp */
682668 template <typename T>
683669 inline int16x8_t accumulate_16 (const T *ptr, int16x8_t sum);
684670
685- /* Load "odd" bytes */
671+ /* Handle "odd" bytes by copying to a temporary buffer */
686672 template <typename T>
687673 inline int16x8_t accumulate_odds_16 (const T *ptr, int16x8_t sum, size_t odds);
688674
@@ -792,8 +778,6 @@ namespace {
792778 UNREACHABLE (" Impossible." );
793779 }
794780 }
795-
796- row_sum_helpers (const Requantize32 &qp) : qp(qp) { }
797781 };
798782
799783 template <>
@@ -814,7 +798,7 @@ namespace {
814798 }
815799 return accumulate_16 (buffer, sum);
816800 }
817- }
801+ } // anonymous namespace
818802
819803template <typename T>
820804void compute_row_sums (const Requantize32 &qp, unsigned int width, unsigned int height,
@@ -825,7 +809,7 @@ void compute_row_sums(const Requantize32 &qp, unsigned int width, unsigned int h
825809 return ;
826810 }
827811
828- row_sum_helpers thehelpers (qp) ;
812+ row_sum_helpers thehelpers;
829813
830814 const int32x4_t offset_mul = vdupq_n_s32 (-qp.b_offset );
831815
0 commit comments