-
Notifications
You must be signed in to change notification settings - Fork 347
Open
Description
The bfly functions do multiple loads and stores of Fout
e.g.
C_ADDTO(*Fout,scratch[3]);
Code size and performance is improved by loading Fout at the top of the loop and storing it at the bottom, using registers for the operations.
On ARMv7 the bfly4 function main loop
Was
170 instructions
46 loads
36 stores
Now
140 instructions
35 loads
19 stores
This is bfly4 with the change:
static void kf_bfly4(
kiss_fft_cpx * Fout,
const size_t fstride,
const kiss_fft_cfg st,
const size_t m
)
{
kiss_fft_cpx *tw1,*tw2,*tw3;
kiss_fft_cpx scratch[6];
kiss_fft_cpx Fout0, Fout1, Fout2, Fout3;
size_t k=m;
const size_t m2=2*m;
const size_t m3=3*m;
tw3 = tw2 = tw1 = st->twiddles;
do {
Fout0 = Fout[0];
Fout1 = Fout[m];
Fout2 = Fout[m2];
Fout3 = Fout[m3];
C_FIXDIV(Fout0,4); C_FIXDIV(Fout1,4); C_FIXDIV(Fout2,4); C_FIXDIV(Fout3,4);
C_MUL(scratch[0],Fout1 , *tw1 );
C_MUL(scratch[1],Fout2 , *tw2 );
C_MUL(scratch[2],Fout3 , *tw3 );
C_SUB( scratch[5] , Fout0, scratch[1] );
C_ADDTO(Fout0, scratch[1]);
C_ADD( scratch[3] , scratch[0] , scratch[2] );
C_SUB( scratch[4] , scratch[0] , scratch[2] );
C_SUB( Fout2, Fout0, scratch[3] );
tw1 += fstride;
tw2 += fstride*2;
tw3 += fstride*3;
C_ADDTO( Fout0 , scratch[3] );
if(st->inverse) {
Fout1.r = scratch[5].r - scratch[4].i;
Fout1.i = scratch[5].i + scratch[4].r;
Fout3.r = scratch[5].r + scratch[4].i;
Fout3.i = scratch[5].i - scratch[4].r;
}else{
Fout1.r = scratch[5].r + scratch[4].i;
Fout1.i = scratch[5].i - scratch[4].r;
Fout3.r = scratch[5].r - scratch[4].i;
Fout3.i = scratch[5].i + scratch[4].r;
}
Fout[0] = Fout0;
Fout[m] = Fout1;
Fout[m2] = Fout2;
Fout[m3] = Fout3;
++Fout;
}while(--k);
}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels