Improve data layout in RFFT float32

Phil Wang · Phil.Wang · commit 1d409d2dccc2 · 2015-07-20T12:38:31.000+08:00
Transpose part of the twiddle table in RFFT float32 so that the kernels
  no longer access memory with a large stride.

Change-Id: I5e05c5baed523183ed3948371e6b1fbffc916e9b
diff --git a/modules/dsp/NE10_fft.c b/modules/dsp/NE10_fft.c
@@ -149,10 +149,10 @@ ne10_int32_t ne10_factor (ne10_int32_t n,
     return NE10_OK;
 }
 
-// Twiddles matrix [mstride][radix-1]
-// First column (k == 0)is ignored because phase == 1, and
+// Twiddles matrix [radix-1][mstride]
+// First row (k == 0) is ignored because phase == 1, and
 // twiddle = (1.0, 0.0).
-static void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * twiddles,
+void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * twiddles,
         const ne10_int32_t mstride,
         const ne10_int32_t fstride,
         const ne10_int32_t radix,
@@ -173,6 +173,33 @@ static void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * tw
     } // mstride
 }
 
+// Generates one stage of twiddles in transposed layout [mstride][radix-1]:
+// entry (j, k) is stored at twiddles[(radix - 1) * j + k - 1]. The first column
+// (k == 0) is not stored because its phase is 0, i.e. twiddle = (1.0, 0.0).
+// Transposed twiddle tables are used in RFFT so that the radix - 1 twiddles
+// sharing the same j are contiguous instead of a large stride apart.
+void ne10_fft_generate_twiddles_line_transposed_float32 (
+    ne10_fft_cpx_float32_t* twiddles, // output; mstride * (radix - 1) entries are written
+    const ne10_int32_t mstride, // number of rows (range of j)
+    const ne10_int32_t fstride, // scales the phase together with k and j
+    const ne10_int32_t radix, // row length is radix - 1 (k runs from 1 to radix - 1)
+    const ne10_int32_t nfft) // divisor of the phase
+{
+    ne10_int32_t j, k;
+    ne10_float32_t phase; // NOTE(review): the double-precision product below is narrowed to float32 before cos/sin - confirm this is intended
+    const ne10_float64_t pi = NE10_PI;
+
+    for (j = 0; j < mstride; j++)
+    {
+        for (k = 1; k < radix; k++) // k = 0 is skipped: phase is 0 there, so the twiddle is (1.0, 0.0)
+        {
+            phase = -2 * pi * fstride * k * j / nfft;
+            twiddles[(radix - 1) * j + k - 1].r = (ne10_float32_t) cos (phase); // row j, column k - 1
+            twiddles[(radix - 1) * j + k - 1].i = (ne10_float32_t) sin (phase);
+        } // radix
+    } // mstride
+}
+
 // Twiddles matrix [mstride][radix-1]
 // First column (k == 0)is ignored because phase == 1, and
 // twiddle = (1.0, 0.0).
@@ -232,9 +259,17 @@ ne10_fft_cpx_int32_t* ne10_fft_generate_twiddles_int32 (ne10_fft_cpx_int32_t * t
     return twiddles;
 }
 
-ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles,
-        const ne10_int32_t * factors,
-        const ne10_int32_t nfft )
+typedef void (*line_generator_float32)(ne10_fft_cpx_float32_t*, // per-stage twiddle generator; 1st argument: twiddles (output)
+      const ne10_int32_t, // mstride
+      const ne10_int32_t, // fstride
+      const ne10_int32_t, // radix
+      const ne10_int32_t); // nfft
+
+ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_impl_float32 (
+      line_generator_float32 generator,
+      ne10_fft_cpx_float32_t * twiddles,
+      const ne10_int32_t * factors,
+      const ne10_int32_t nfft)
 {
     ne10_int32_t stage_count = factors[0];
     ne10_int32_t fstride = factors[1];
@@ -248,7 +283,7 @@ ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32
         twiddles[0].r = 1.0;
         twiddles[0].i = 0.0;
         twiddles += 1;
-        ne10_fft_generate_twiddles_line_float32 (twiddles, 1, fstride, cur_radix, nfft);
+        generator (twiddles, 1, fstride, cur_radix, nfft);
         twiddles += cur_radix - 1;
     }
     stage_count --;
@@ -259,13 +294,35 @@ ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32
         cur_radix = factors[2 * stage_count];
         fstride /= cur_radix;
         mstride = factors[2 * stage_count + 1];
-        ne10_fft_generate_twiddles_line_float32 (twiddles, mstride, fstride, cur_radix, nfft);
+        generator (twiddles, mstride, fstride, cur_radix, nfft);
         twiddles += mstride * (cur_radix - 1);
     } // stage_count
 
     return twiddles;
 }
 
+ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles,
+        const ne10_int32_t * factors,
+        const ne10_int32_t nfft )
+{ // Fills the twiddle table using the non-transposed line generator.
+    line_generator_float32 generator = ne10_fft_generate_twiddles_line_float32;
+    twiddles = ne10_fft_generate_twiddles_impl_float32(generator,
+        twiddles, factors, nfft);
+    return twiddles; // value returned by the impl: twiddles advanced past the generated stages
+}
+
+ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_transposed_float32 (
+      ne10_fft_cpx_float32_t * twiddles,
+      const ne10_int32_t * factors,
+      const ne10_int32_t nfft)
+{ // Same as ne10_fft_generate_twiddles_float32, but each stage is written by the transposed line generator.
+    line_generator_float32 generator =
+        ne10_fft_generate_twiddles_line_transposed_float32;
+    twiddles = ne10_fft_generate_twiddles_impl_float32(generator,
+        twiddles, factors, nfft);
+    return twiddles; // value returned by the impl: twiddles advanced past the generated stages
+}
+
 /**
  * @addtogroup C2C_FFT_IFFT
  * @{
diff --git a/modules/dsp/NE10_fft.h b/modules/dsp/NE10_fft.h
@@ -76,6 +76,11 @@ extern "C" {
         const ne10_int32_t * factors,
         const ne10_int32_t nfft );
 
+    extern ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_transposed_float32 (
+        ne10_fft_cpx_float32_t * twiddles,
+        const ne10_int32_t * factors,
+        const ne10_int32_t nfft );
+
     extern ne10_fft_cpx_int32_t* ne10_fft_generate_twiddles_int32 (ne10_fft_cpx_int32_t * twiddles,
         const ne10_int32_t * factors,
         const ne10_int32_t nfft );
diff --git a/modules/dsp/NE10_rfft_float32.c b/modules/dsp/NE10_rfft_float32.c
@@ -798,7 +798,11 @@ ne10_fft_r2c_cfg_float32_t ne10_fft_alloc_r2c_float32 (ne10_int32_t nfft)
         return st;
     }
 
-    st->r_twiddles_neon_backward = ne10_fft_generate_twiddles_float32 (st->r_twiddles_neon, st->r_factors_neon, nfft/4);
+    // Twiddle table is transposed here to improve cache access performance.
+    st->r_twiddles_neon_backward = ne10_fft_generate_twiddles_transposed_float32 (
+        st->r_twiddles_neon,
+        st->r_factors_neon,
+        nfft/4);
 
     // nfft/4 x 4
     tw = st->r_super_twiddles_neon;
diff --git a/modules/dsp/NE10_rfft_float32.neonintrinsic.c b/modules/dsp/NE10_rfft_float32.neonintrinsic.c
@@ -284,11 +284,11 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
         q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
         q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
 
-        q2_tw1.val[0] = vdupq_n_f32(twiddles[out_step].r);
-        q2_tw1.val[1] = vdupq_n_f32(twiddles[out_step].i);
+        q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
+        q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
 
-        q2_tw2.val[0] = vdupq_n_f32(twiddles[out_step*2].r);
-        q2_tw2.val[1] = vdupq_n_f32(twiddles[out_step*2].i);
+        q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
+        q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
 
         // R2C TW KERNEL
         NE10_RADIX4x4_R2C_TW_MUL_NEON (q2_out, q2_in, q2_tw);
@@ -298,17 +298,13 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
 #else // __aarch64__
         const ne10_float32_t *ptr_inr = ((const ne10_float32_t *) Fin_neon);
         const ne10_float32_t *ptr_ini = ((const ne10_float32_t *) Fin_neon) + 4;
-        const ne10_float32_t *ptr_tw  = (const ne10_float32_t *) twiddles;
-
         asm volatile (
             "ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
             "ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
             "ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
             "ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
             "ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
-            "ld1 {v0.1d},  [%[ptr_tw]], %[offset_out] \n\t"
-            "ld1 {v1.1d},  [%[ptr_tw]], %[offset_out] \n\t"
-            "ld1 {v2.1d},  [%[ptr_tw]] \n\t"
+            "ld1 {v0.1d, v1.1d, v2.1d},  [%[ptr_tw]] \n\t"
 
             "fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t" // RR
             "fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t" // RI
@@ -333,10 +329,9 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
           [q2_out3r]"+w"(q2_out3.val[0]),
           [q2_out3i]"+w"(q2_out3.val[1]),
           [ptr_inr]"+r"(ptr_inr),
-          [ptr_ini]"+r"(ptr_ini),
-          [ptr_tw]"+r"(ptr_tw)
+          [ptr_ini]"+r"(ptr_ini)
         : [offset_in]"r"(in_step * 16),
-          [offset_out]"r"(out_step * 8)
+          [ptr_tw]"r"(twiddles)
         : "memory", "v0", "v1", "v2",
           "v10", "v11", "v12", "v13", "v14", "v15"
         );
@@ -363,7 +358,7 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4
         Fin_neon  += 2;
         Fout_neon += 2;
         Fout_b    -= 2;
-        twiddles ++;
+        twiddles += 3;
     }
 }
 
@@ -399,11 +394,11 @@ NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4
         q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
         q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
 
-        q2_tw1.val[0] = vdupq_n_f32(twiddles[out_step].r);
-        q2_tw1.val[1] = vdupq_n_f32(twiddles[out_step].i);
+        q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
+        q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
 
-        q2_tw2.val[0] = vdupq_n_f32(twiddles[out_step*2].r);
-        q2_tw2.val[1] = vdupq_n_f32(twiddles[out_step*2].i);
+        q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
+        q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
 
         // NE10_PRINT_Q2x4_VECTOR(q2_in);
 
@@ -429,7 +424,7 @@ NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4
         Fin_neon  += 2;
         Fout_neon += 2;
         Fin_b    -= 2;
-        twiddles ++;
+        twiddles += 3;
     }
 }
 
@@ -496,27 +491,25 @@ NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_neon (ne10_fft_cpx_float32_t *F
 
     for (f_count = fstride; f_count; f_count --)
     {
-        tw = twiddles;
+        tw = twiddles + 3;
 
         // first butterfly
-        ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
+        ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, NULL);
 
-        tw ++;
         Fin_neon ++;
         Fout_neon ++;
 
         // other butterfly
+        // Twiddle tables are transposed to avoid memory access by a large stride.
         ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
 
         // update Fin_r, Fout_r, twiddles
-        tw        +=     ( (out_step >> 1) - 1);
         Fin_neon  += 2 * ( (out_step >> 1) - 1);
         Fout_neon += 2 * ( (out_step >> 1) - 1);
 
         // last butterfly
-        ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);
+        ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
         Fin_neon ++;
-        tw++;
         Fout_neon ++;
 
         Fout_neon = Fout_neon + 3 * out_step;
@@ -540,27 +533,25 @@ NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_neon (ne10_fft_cpx_float32_t *F
 
     for (f_count = fstride; f_count; f_count --)
     {
-        tw = twiddles;
+        tw = twiddles + 3;
 
         // first butterfly
-        ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
+        ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, NULL);
 
-        tw ++;
         Fin_neon ++;
         Fout_neon ++;
 
         // other butterfly
+        // Twiddle tables are transposed to avoid memory access by a large stride.
         ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon ( Fout_neon, Fin_neon, out_step, in_step, tw);
 
         // update Fin_r, Fout_r, twiddles
-        tw        +=     ( (out_step >> 1) - 1);
         Fin_neon  += 2 * ( (out_step >> 1) - 1);
         Fout_neon += 2 * ( (out_step >> 1) - 1);
 
         // last butterfly
-        ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);
+        ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
         Fin_neon ++;
-        tw++;
         Fout_neon ++;
 
         Fin_neon = Fin_neon + 3 * out_step;