@@ -284,11 +284,11 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
284284 q2_tw0 .val [0 ] = vdupq_n_f32 (twiddles [0 ].r );
285285 q2_tw0 .val [1 ] = vdupq_n_f32 (twiddles [0 ].i );
286286
287- q2_tw1 .val [0 ] = vdupq_n_f32 (twiddles [out_step ].r );
288- q2_tw1 .val [1 ] = vdupq_n_f32 (twiddles [out_step ].i );
287+ q2_tw1 .val [0 ] = vdupq_n_f32 (twiddles [1 ].r );
288+ q2_tw1 .val [1 ] = vdupq_n_f32 (twiddles [1 ].i );
289289
290- q2_tw2 .val [0 ] = vdupq_n_f32 (twiddles [out_step * 2 ].r );
291- q2_tw2 .val [1 ] = vdupq_n_f32 (twiddles [out_step * 2 ].i );
290+ q2_tw2 .val [0 ] = vdupq_n_f32 (twiddles [2 ].r );
291+ q2_tw2 .val [1 ] = vdupq_n_f32 (twiddles [2 ].i );
292292
293293 // R2C TW KERNEL
294294 NE10_RADIX4x4_R2C_TW_MUL_NEON (q2_out , q2_in , q2_tw );
@@ -298,17 +298,13 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
298298#else // __aarch64__
299299 const ne10_float32_t * ptr_inr = ((const ne10_float32_t * ) Fin_neon );
300300 const ne10_float32_t * ptr_ini = ((const ne10_float32_t * ) Fin_neon ) + 4 ;
301- const ne10_float32_t * ptr_tw = (const ne10_float32_t * ) twiddles ;
302-
303301 asm volatile (
304302 "ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
305303 "ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
306304 "ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
307305 "ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
308306 "ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
309- "ld1 {v0.1d}, [%[ptr_tw]], %[offset_out] \n\t"
310- "ld1 {v1.1d}, [%[ptr_tw]], %[offset_out] \n\t"
311- "ld1 {v2.1d}, [%[ptr_tw]] \n\t"
307+ "ld1 {v0.1d, v1.1d, v2.1d}, [%[ptr_tw]] \n\t"
312308
313309 "fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t" // RR
314310 "fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t" // RI
@@ -333,10 +329,9 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
333329 [q2_out3r ]"+w" (q2_out3 .val [0 ]),
334330 [q2_out3i ]"+w" (q2_out3 .val [1 ]),
335331 [ptr_inr ]"+r" (ptr_inr ),
336- [ptr_ini ]"+r" (ptr_ini ),
337- [ptr_tw ]"+r" (ptr_tw )
332+ [ptr_ini ]"+r" (ptr_ini )
338333 : [offset_in ]"r" (in_step * 16 ),
339- [offset_out ]"r" (out_step * 8 )
334+ [ptr_tw ]"r" (twiddles )
340335 : "memory" , "v0" , "v1" , "v2" ,
341336 "v10" , "v11" , "v12" , "v13" , "v14" , "v15"
342337 );
@@ -363,7 +358,7 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
363358 Fin_neon += 2 ;
364359 Fout_neon += 2 ;
365360 Fout_b -= 2 ;
366- twiddles ++ ;
361+ twiddles += 3 ;
367362 }
368363}
369364
@@ -399,11 +394,11 @@ NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4
399394 q2_tw0 .val [0 ] = vdupq_n_f32 (twiddles [0 ].r );
400395 q2_tw0 .val [1 ] = vdupq_n_f32 (twiddles [0 ].i );
401396
402- q2_tw1 .val [0 ] = vdupq_n_f32 (twiddles [out_step ].r );
403- q2_tw1 .val [1 ] = vdupq_n_f32 (twiddles [out_step ].i );
397+ q2_tw1 .val [0 ] = vdupq_n_f32 (twiddles [1 ].r );
398+ q2_tw1 .val [1 ] = vdupq_n_f32 (twiddles [1 ].i );
404399
405- q2_tw2 .val [0 ] = vdupq_n_f32 (twiddles [out_step * 2 ].r );
406- q2_tw2 .val [1 ] = vdupq_n_f32 (twiddles [out_step * 2 ].i );
400+ q2_tw2 .val [0 ] = vdupq_n_f32 (twiddles [2 ].r );
401+ q2_tw2 .val [1 ] = vdupq_n_f32 (twiddles [2 ].i );
407402
408403 // NE10_PRINT_Q2x4_VECTOR(q2_in);
409404
@@ -429,7 +424,7 @@ NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4
429424 Fin_neon += 2 ;
430425 Fout_neon += 2 ;
431426 Fin_b -= 2 ;
432- twiddles ++ ;
427+ twiddles += 3 ;
433428 }
434429}
435430
@@ -496,27 +491,25 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_neon (ne10_fft_cpx_float32_t *F
496491
497492 for (f_count = fstride ; f_count ; f_count -- )
498493 {
499- tw = twiddles ;
494+ tw = twiddles + 3 ;
500495
501496 // first butterfly
502- ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon ( Fout_neon , Fin_neon , out_step , in_step , tw );
497+ ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon ( Fout_neon , Fin_neon , out_step , in_step , NULL );
503498
504- tw ++ ;
505499 Fin_neon ++ ;
506500 Fout_neon ++ ;
507501
508502 // other butterfly
503+ // Twiddle tables are transposed so that the three twiddles used by each butterfly are contiguous, which avoids large-stride memory accesses.
509504 ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon ( Fout_neon , Fin_neon , out_step , in_step , tw );
510505
511506 // update Fin_r, Fout_r (tw is not advanced here: the last butterfly is called without twiddles)
512- tw += ( (out_step >> 1 ) - 1 );
513507 Fin_neon += 2 * ( (out_step >> 1 ) - 1 );
514508 Fout_neon += 2 * ( (out_step >> 1 ) - 1 );
515509
516510 // last butterfly
517- ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon , Fin_neon , out_step , in_step , tw );
511+ ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon , Fin_neon , out_step , in_step , NULL );
518512 Fin_neon ++ ;
519- tw ++ ;
520513 Fout_neon ++ ;
521514
522515 Fout_neon = Fout_neon + 3 * out_step ;
@@ -540,27 +533,25 @@ NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_neon (ne10_fft_cpx_float32_t *F
540533
541534 for (f_count = fstride ; f_count ; f_count -- )
542535 {
543- tw = twiddles ;
536+ tw = twiddles + 3 ;
544537
545538 // first butterfly
546- ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon ( Fout_neon , Fin_neon , out_step , in_step , tw );
539+ ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon ( Fout_neon , Fin_neon , out_step , in_step , NULL );
547540
548- tw ++ ;
549541 Fin_neon ++ ;
550542 Fout_neon ++ ;
551543
552544 // other butterfly
545+ // Twiddle tables are transposed so that the three twiddles used by each butterfly are contiguous, which avoids large-stride memory accesses.
553546 ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon ( Fout_neon , Fin_neon , out_step , in_step , tw );
554547
555548 // update Fin_r, Fout_r (tw is not advanced here: the last butterfly is called without twiddles)
556- tw += ( (out_step >> 1 ) - 1 );
557549 Fin_neon += 2 * ( (out_step >> 1 ) - 1 );
558550 Fout_neon += 2 * ( (out_step >> 1 ) - 1 );
559551
560552 // last butterfly
561- ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon , Fin_neon , out_step , in_step , tw );
553+ ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon , Fin_neon , out_step , in_step , NULL );
562554 Fin_neon ++ ;
563- tw ++ ;
564555 Fout_neon ++ ;
565556
566557 Fin_neon = Fin_neon + 3 * out_step ;
0 commit comments