Skip to content

Commit 1d409d2

Browse files
author
Phil Wang
committed
Improve data layout in RFFT float32
Transpose part of the twiddle tables in RFFT float32 to avoid large-stride memory accesses. Change-Id: I5e05c5baed523183ed3948371e6b1fbffc916e9b
1 parent 26b199d commit 1d409d2

File tree

4 files changed

+96
-39
lines changed

4 files changed

+96
-39
lines changed

modules/dsp/NE10_fft.c

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,10 @@ ne10_int32_t ne10_factor (ne10_int32_t n,
149149
return NE10_OK;
150150
}
151151

152-
// Twiddles matrix [mstride][radix-1]
153-
// First column (k == 0)is ignored because phase == 1, and
152+
// Twiddles matrix [radix-1][mstride]
153+
// First row (k == 0) is ignored because phase == 1, and
154154
// twiddle = (1.0, 0.0).
155-
static void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * twiddles,
155+
void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * twiddles,
156156
const ne10_int32_t mstride,
157157
const ne10_int32_t fstride,
158158
const ne10_int32_t radix,
@@ -173,6 +173,33 @@ static void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * tw
173173
} // mstride
174174
}
175175

176+
// Transposed twiddles matrix [mstride][radix-1]
177+
// First row (k == 0) is ignored because phase == 1, and
178+
// twiddle = (1.0, 0.0).
179+
// Transposed twiddle tables are used in RFFT to avoid memory access by a large
180+
// stride.
181+
void ne10_fft_generate_twiddles_line_transposed_float32 (
182+
ne10_fft_cpx_float32_t* twiddles,
183+
const ne10_int32_t mstride,
184+
const ne10_int32_t fstride,
185+
const ne10_int32_t radix,
186+
const ne10_int32_t nfft)
187+
{
188+
ne10_int32_t j, k;
189+
ne10_float32_t phase;
190+
const ne10_float64_t pi = NE10_PI;
191+
192+
for (j = 0; j < mstride; j++)
193+
{
194+
for (k = 1; k < radix; k++) // phase = 1 when k = 0
195+
{
196+
phase = -2 * pi * fstride * k * j / nfft;
197+
twiddles[(radix - 1) * j + k - 1].r = (ne10_float32_t) cos (phase);
198+
twiddles[(radix - 1) * j + k - 1].i = (ne10_float32_t) sin (phase);
199+
} // radix
200+
} // mstride
201+
}
202+
176203
// Twiddles matrix [mstride][radix-1]
177204
// First column (k == 0)is ignored because phase == 1, and
178205
// twiddle = (1.0, 0.0).
@@ -232,9 +259,17 @@ ne10_fft_cpx_int32_t* ne10_fft_generate_twiddles_int32 (ne10_fft_cpx_int32_t * t
232259
return twiddles;
233260
}
234261

235-
ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles,
236-
const ne10_int32_t * factors,
237-
const ne10_int32_t nfft )
262+
typedef void (*line_generator_float32)(ne10_fft_cpx_float32_t*,
263+
const ne10_int32_t,
264+
const ne10_int32_t,
265+
const ne10_int32_t,
266+
const ne10_int32_t);
267+
268+
ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_impl_float32 (
269+
line_generator_float32 generator,
270+
ne10_fft_cpx_float32_t * twiddles,
271+
const ne10_int32_t * factors,
272+
const ne10_int32_t nfft)
238273
{
239274
ne10_int32_t stage_count = factors[0];
240275
ne10_int32_t fstride = factors[1];
@@ -248,7 +283,7 @@ ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32
248283
twiddles[0].r = 1.0;
249284
twiddles[0].i = 0.0;
250285
twiddles += 1;
251-
ne10_fft_generate_twiddles_line_float32 (twiddles, 1, fstride, cur_radix, nfft);
286+
generator (twiddles, 1, fstride, cur_radix, nfft);
252287
twiddles += cur_radix - 1;
253288
}
254289
stage_count --;
@@ -259,13 +294,35 @@ ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32
259294
cur_radix = factors[2 * stage_count];
260295
fstride /= cur_radix;
261296
mstride = factors[2 * stage_count + 1];
262-
ne10_fft_generate_twiddles_line_float32 (twiddles, mstride, fstride, cur_radix, nfft);
297+
generator (twiddles, mstride, fstride, cur_radix, nfft);
263298
twiddles += mstride * (cur_radix - 1);
264299
} // stage_count
265300

266301
return twiddles;
267302
}
268303

304+
ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles,
305+
const ne10_int32_t * factors,
306+
const ne10_int32_t nfft )
307+
{
308+
line_generator_float32 generator = ne10_fft_generate_twiddles_line_float32;
309+
twiddles = ne10_fft_generate_twiddles_impl_float32(generator,
310+
twiddles, factors, nfft);
311+
return twiddles;
312+
}
313+
314+
ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_transposed_float32 (
315+
ne10_fft_cpx_float32_t * twiddles,
316+
const ne10_int32_t * factors,
317+
const ne10_int32_t nfft)
318+
{
319+
line_generator_float32 generator =
320+
ne10_fft_generate_twiddles_line_transposed_float32;
321+
twiddles = ne10_fft_generate_twiddles_impl_float32(generator,
322+
twiddles, factors, nfft);
323+
return twiddles;
324+
}
325+
269326
/**
270327
* @addtogroup C2C_FFT_IFFT
271328
* @{

modules/dsp/NE10_fft.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ extern "C" {
7676
const ne10_int32_t * factors,
7777
const ne10_int32_t nfft );
7878

79+
extern ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_transposed_float32 (
80+
ne10_fft_cpx_float32_t * twiddles,
81+
const ne10_int32_t * factors,
82+
const ne10_int32_t nfft );
83+
7984
extern ne10_fft_cpx_int32_t* ne10_fft_generate_twiddles_int32 (ne10_fft_cpx_int32_t * twiddles,
8085
const ne10_int32_t * factors,
8186
const ne10_int32_t nfft );

modules/dsp/NE10_rfft_float32.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -798,7 +798,11 @@ ne10_fft_r2c_cfg_float32_t ne10_fft_alloc_r2c_float32 (ne10_int32_t nfft)
798798
return st;
799799
}
800800

801-
st->r_twiddles_neon_backward = ne10_fft_generate_twiddles_float32 (st->r_twiddles_neon, st->r_factors_neon, nfft/4);
801+
// Twiddle table is transposed here to improve cache access performance.
802+
st->r_twiddles_neon_backward = ne10_fft_generate_twiddles_transposed_float32 (
803+
st->r_twiddles_neon,
804+
st->r_factors_neon,
805+
nfft/4);
802806

803807
// nfft/4 x 4
804808
tw = st->r_super_twiddles_neon;

modules/dsp/NE10_rfft_float32.neonintrinsic.c

Lines changed: 21 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -284,11 +284,11 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
284284
q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
285285
q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
286286

287-
q2_tw1.val[0] = vdupq_n_f32(twiddles[out_step].r);
288-
q2_tw1.val[1] = vdupq_n_f32(twiddles[out_step].i);
287+
q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
288+
q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
289289

290-
q2_tw2.val[0] = vdupq_n_f32(twiddles[out_step*2].r);
291-
q2_tw2.val[1] = vdupq_n_f32(twiddles[out_step*2].i);
290+
q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
291+
q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
292292

293293
// R2C TW KERNEL
294294
NE10_RADIX4x4_R2C_TW_MUL_NEON (q2_out, q2_in, q2_tw);
@@ -298,17 +298,13 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
298298
#else // __aarch64__
299299
const ne10_float32_t *ptr_inr = ((const ne10_float32_t *) Fin_neon);
300300
const ne10_float32_t *ptr_ini = ((const ne10_float32_t *) Fin_neon) + 4;
301-
const ne10_float32_t *ptr_tw = (const ne10_float32_t *) twiddles;
302-
303301
asm volatile (
304302
"ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
305303
"ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
306304
"ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
307305
"ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
308306
"ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
309-
"ld1 {v0.1d}, [%[ptr_tw]], %[offset_out] \n\t"
310-
"ld1 {v1.1d}, [%[ptr_tw]], %[offset_out] \n\t"
311-
"ld1 {v2.1d}, [%[ptr_tw]] \n\t"
307+
"ld1 {v0.1d, v1.1d, v2.1d}, [%[ptr_tw]] \n\t"
312308

313309
"fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t" // RR
314310
"fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t" // RI
@@ -333,10 +329,9 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
333329
[q2_out3r]"+w"(q2_out3.val[0]),
334330
[q2_out3i]"+w"(q2_out3.val[1]),
335331
[ptr_inr]"+r"(ptr_inr),
336-
[ptr_ini]"+r"(ptr_ini),
337-
[ptr_tw]"+r"(ptr_tw)
332+
[ptr_ini]"+r"(ptr_ini)
338333
: [offset_in]"r"(in_step * 16),
339-
[offset_out]"r"(out_step * 8)
334+
[ptr_tw]"r"(twiddles)
340335
: "memory", "v0", "v1", "v2",
341336
"v10", "v11", "v12", "v13", "v14", "v15"
342337
);
@@ -363,7 +358,7 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
363358
Fin_neon += 2;
364359
Fout_neon += 2;
365360
Fout_b -= 2;
366-
twiddles ++;
361+
twiddles += 3;
367362
}
368363
}
369364

@@ -399,11 +394,11 @@ NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4
399394
q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
400395
q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
401396

402-
q2_tw1.val[0] = vdupq_n_f32(twiddles[out_step].r);
403-
q2_tw1.val[1] = vdupq_n_f32(twiddles[out_step].i);
397+
q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
398+
q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
404399

405-
q2_tw2.val[0] = vdupq_n_f32(twiddles[out_step*2].r);
406-
q2_tw2.val[1] = vdupq_n_f32(twiddles[out_step*2].i);
400+
q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
401+
q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
407402

408403
// NE10_PRINT_Q2x4_VECTOR(q2_in);
409404

@@ -429,7 +424,7 @@ NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4
429424
Fin_neon += 2;
430425
Fout_neon += 2;
431426
Fin_b -= 2;
432-
twiddles ++;
427+
twiddles += 3;
433428
}
434429
}
435430

@@ -496,27 +491,25 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_neon (ne10_fft_cpx_float32_t *F
496491

497492
for (f_count = fstride; f_count; f_count --)
498493
{
499-
tw = twiddles;
494+
tw = twiddles + 3;
500495

501496
// first butterfly
502-
ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
497+
ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, NULL);
503498

504-
tw ++;
505499
Fin_neon ++;
506500
Fout_neon ++;
507501

508502
// other butterfly
503+
// Twiddle tables are transposed to avoid memory access by a large stride.
509504
ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
510505

511506
// update Fin_r, Fout_r, twiddles
512-
tw += ( (out_step >> 1) - 1);
513507
Fin_neon += 2 * ( (out_step >> 1) - 1);
514508
Fout_neon += 2 * ( (out_step >> 1) - 1);
515509

516510
// last butterfly
517-
ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);
511+
ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
518512
Fin_neon ++;
519-
tw++;
520513
Fout_neon ++;
521514

522515
Fout_neon = Fout_neon + 3 * out_step;
@@ -540,27 +533,25 @@ NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_neon (ne10_fft_cpx_float32_t *F
540533

541534
for (f_count = fstride; f_count; f_count --)
542535
{
543-
tw = twiddles;
536+
tw = twiddles + 3;
544537

545538
// first butterfly
546-
ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
539+
ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, NULL);
547540

548-
tw ++;
549541
Fin_neon ++;
550542
Fout_neon ++;
551543

552544
// other butterfly
545+
// Twiddle tables are transposed to avoid memory access by a large stride.
553546
ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
554547

555548
// update Fin_r, Fout_r, twiddles
556-
tw += ( (out_step >> 1) - 1);
557549
Fin_neon += 2 * ( (out_step >> 1) - 1);
558550
Fout_neon += 2 * ( (out_step >> 1) - 1);
559551

560552
// last butterfly
561-
ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);
553+
ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
562554
Fin_neon ++;
563-
tw++;
564555
Fout_neon ++;
565556

566557
Fin_neon = Fin_neon + 3 * out_step;

0 commit comments

Comments
 (0)