Skip to content

Commit fd088ac

Browse files
improve arm_correlate_q7 for CM0 (ARM-software#178)
The GCC and Clang compilers are unable to detect the code similarities and merge the __SSATs. Let's help them emit better code. Co-authored-by: Christophe Favergeon <[email protected]>
1 parent a9c26d6 commit fd088ac

File tree

1 file changed

+6
-8
lines changed

1 file changed

+6
-8
lines changed

Source/FilteringFunctions/arm_correlate_q7.c

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -921,15 +921,15 @@ void arm_correlate_q7(
921921
const q7_t *pIn2 = pSrcB + (srcBLen - 1U); /* InputB pointer */
922922
q31_t sum; /* Accumulator */
923923
uint32_t i = 0U, j; /* Loop counters */
924-
uint32_t inv = 0U; /* Reverse order flag */
924+
int32_t inc = 1; /* Destination address modifier */
925925
uint32_t tot = 0U; /* Length */
926926

927927
/* The algorithm implementation is based on the lengths of the inputs. */
928928
/* srcB is always made to slide across srcA. */
929929
/* So srcBLen is always considered as shorter or equal to srcALen */
930930
/* But CORR(x, y) is reverse of CORR(y, x) */
931931
/* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
932-
/* and a varaible, inv is set to 1 */
932+
/* and a variable, inc, is set to -1 */
933933
/* If lengths are not equal then zero pad has to be done to make the two
934934
* inputs of same length. But to improve the performance, we include zeroes
935935
* in the output instead of zero padding either of the inputs */
@@ -968,8 +968,8 @@ void arm_correlate_q7(
968968
srcALen = srcBLen;
969969
srcBLen = j;
970970

971-
/* Setting the reverse flag */
972-
inv = 1;
971+
/* Filling destination in reverse order */
972+
inc = -1;
973973
}
974974

975975
/* Loop to calculate convolution for output length number of times */
@@ -990,10 +990,8 @@ void arm_correlate_q7(
990990
}
991991

992992
/* Store the output in the destination buffer */
993-
if (inv == 1)
994-
*pDst-- = (q7_t) __SSAT((sum >> 7U), 8U);
995-
else
996-
*pDst++ = (q7_t) __SSAT((sum >> 7U), 8U);
993+
*pDst = (q7_t) __SSAT((sum >> 7U), 8U);
994+
pDst += inc;
997995
}
998996

999997
#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */

0 commit comments

Comments
 (0)