Skip to content

Commit 6dbcc09

Browse files
committed
refactor: gemm: update comment following previous fix.
Minor tidy up.

Resolves: COMPMID-8396
Signed-off-by: David Mansell <[email protected]>
Change-Id: I6f8d4a6f9a23b659174844fc0895ef9680ce873d
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14684
Tested-by: Arm Jenkins <[email protected]>
Comments-Addressed: Arm Jenkins <[email protected]>
Benchmark: Arm Jenkins <[email protected]>
Reviewed-by: Gunes Bayir <[email protected]>
1 parent 6bc1c7b commit 6dbcc09

File tree

1 file changed

+7
-23
lines changed

1 file changed

+7
-23
lines changed

src/core/NEON/kernels/arm_gemm/quantized.cpp

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -642,22 +642,6 @@ template void requantize_block_32(const Requantize32 &qp, unsigned int width, un
642642
/*
643643
* Routine (and helpers) to compute row sums needed for offset correction.
644644
*
645-
* This is often needed for a lot of short rows (e.g. Syrax 5 - 6400 rows
646-
* of length 27), therefore it's important not to sacrifice performance on
647-
* odd length rows.
648-
*
649-
* To minimize performance loss in these cases, this routine will overread
650-
* by up to 7 bytes.
651-
*
652-
* This is handled via "mask" and "mask mode" parameters to the inner
653-
* routines; mask mode == 1 indicates that there are between 1 and 8 bytes
654-
* (inclusive) needed at the end; in these cases we always read 8 bytes.
655-
* mask mode == 2 indicates that there are between 9 and 15 bytes needed at
656-
* the end, and in this case we always read 16 bytes. In both cases the
657-
* 'mask' vector is set up so that the read value can be masked off to clear
658-
* the overread lanes. This is handled by 'accumulate_masked_8' and
659-
* 'accumulate_masked_16' above.
660-
*
661645
* This routine is templated on the type to be accumulated, because the
662646
* innermost instruction used needs to be of the correct signedness.
663647
* However, beyond this point we always use signed values in both cases.
@@ -670,19 +654,21 @@ template void requantize_block_32(const Requantize32 &qp, unsigned int width, un
670654
* accumulators. The 4 accumulators for up to 4 rows being processed are
671655
* then added together into a single output vector using pairwise adds.
672656
*
657+
* For odd lengths (not multiples of 16), the odd bytes are copied into a
658+
* temporary 16-byte buffer before calling the standard routine to avoid
659+
* overreading the input.
660+
*
673661
* This reduction from the 8x16-bit into the 4x32-bit accumulators needs to
674662
* occur before the 16-bit accumulators can overflow - which is every 32
675663
* iterations (512 total bytes processed). This is explained more below.
676664
*/
677665
namespace {
678666
struct row_sum_helpers {
679-
const Requantize32 &qp;
680-
681667
/* Load a full 16 byte vector, pairwise accumulate into 'sum' with uadalp or sadalp */
682668
template<typename T>
683669
inline int16x8_t accumulate_16(const T *ptr, int16x8_t sum);
684670

685-
/* Load "odd" bytes */
671+
/* Handle "odd" bytes by copying to a temporary buffer */
686672
template<typename T>
687673
inline int16x8_t accumulate_odds_16(const T *ptr, int16x8_t sum, size_t odds);
688674

@@ -792,8 +778,6 @@ namespace {
792778
UNREACHABLE("Impossible.");
793779
}
794780
}
795-
796-
row_sum_helpers(const Requantize32 &qp) : qp(qp) { }
797781
};
798782

799783
template<>
@@ -814,7 +798,7 @@ namespace {
814798
}
815799
return accumulate_16(buffer, sum);
816800
}
817-
}
801+
} // anonymous namespace
818802

819803
template<typename T>
820804
void compute_row_sums(const Requantize32 &qp, unsigned int width, unsigned int height,
@@ -825,7 +809,7 @@ void compute_row_sums(const Requantize32 &qp, unsigned int width, unsigned int h
825809
return;
826810
}
827811

828-
row_sum_helpers thehelpers(qp);
812+
row_sum_helpers thehelpers;
829813

830814
const int32x4_t offset_mul = vdupq_n_s32(-qp.b_offset);
831815

0 commit comments

Comments
 (0)