@@ -187,40 +187,56 @@ Power of 4 FFT, in place
187187
188188"""
189189function fft_pow4! (out:: AbstractVector{T} , in:: AbstractVector{U} , N:: Int , start_out:: Int , stride_out:: Int , start_in:: Int , stride_in:: Int , w:: T ) where {T, U}
190- plusi = sign (imag (w))* im
191190 minusi = - sign (imag (w))* im
192- if N == 4
193- out[start_out + 0 ] = in[start_in] + in[start_in + stride_in] + in[start_in + 2 * stride_in] + in[start_in + 3 * stride_in]
194- out[start_out + stride_out] = in[start_in] + in[start_in + stride_in]* plusi - in[start_in + 2 * stride_in] + in[start_in + 3 * stride_in]* minusi
195- out[start_out + 2 * stride_out] = in[start_in] - in[start_in + stride_in] + in[start_in + 2 * stride_in] - in[start_in + 3 * stride_in]
196- out[start_out + 3 * stride_out] = in[start_in] + in[start_in + stride_in]* minusi - in[start_in + 2 * stride_in] + in[start_in + 3 * stride_in]* plusi
191+ @inbounds if N == 4
192+ xee = in[start_in]
193+ xoe = in[start_in + stride_in]
194+ xeo = in[start_in + 2 * stride_in]
195+ xoo = in[start_in + 3 * stride_in]
196+ xee_p_xeo = xee + xeo
197+ xee_m_xeo = xee - xeo
198+ xoe_p_xoo = xoe + xoo
199+ xoe_m_xoo = - (xoe - xoo)* minusi
200+ out[start_out] = xee_p_xeo + xoe_p_xoo
201+ out[start_out + stride_out] = xee_m_xeo + xoe_m_xoo
202+ out[start_out + 2 * stride_out] = xee_p_xeo - xoe_p_xoo
203+ out[start_out + 3 * stride_out] = xee_m_xeo - xoe_m_xoo
197204 return
198205 end
199206 m = N ÷ 4
200207
201- @muladd fft_pow4! (out, in, m, start_out , stride_out, start_in , stride_in* 4 , w^ 4 )
202- @muladd fft_pow4! (out, in, m, start_out + m* stride_out, stride_out, start_in + stride_in, stride_in* 4 , w^ 4 )
203- @muladd fft_pow4! (out, in, m, start_out + 2 * m* stride_out, stride_out, start_in + 2 * stride_in, stride_in* 4 , w^ 4 )
204- @muladd fft_pow4! (out, in, m, start_out + 3 * m* stride_out, stride_out, start_in + 3 * stride_in, stride_in* 4 , w^ 4 )
205-
206208 w1 = w
207209 w2 = w* w1
208210 w3 = w* w2
209- wk1 = wk2 = wk3 = one (T)
211+ w4 = w* w3
212+
213+ fft_pow4! (out, in, m, start_out , stride_out, start_in , stride_in* 4 , w4)
214+ fft_pow4! (out, in, m, start_out + m* stride_out, stride_out, start_in + stride_in, stride_in* 4 , w4)
215+ fft_pow4! (out, in, m, start_out + 2 * m* stride_out, stride_out, start_in + 2 * stride_in, stride_in* 4 , w4)
216+ fft_pow4! (out, in, m, start_out + 3 * m* stride_out, stride_out, start_in + 3 * stride_in, stride_in* 4 , w4)
217+
218+ wkoe = wkeo = wkoo = one (T)
210219
211220 @inbounds for k in 0 : m- 1
212- @muladd k0 = start_out + k* stride_out
213- @muladd k1 = start_out + (k+ m)* stride_out
214- @muladd k2 = start_out + (k+ 2 * m)* stride_out
215- @muladd k3 = start_out + (k+ 3 * m)* stride_out
216- y_k0, y_k1, y_k2, y_k3 = out[k0], out[k1], out[k2], out[k3]
217- @muladd out[k0] = (y_k0 + y_k2* wk2) + (y_k1* wk1 + y_k3* wk3)
218- @muladd out[k1] = (y_k0 - y_k2* wk2) + (y_k1* wk1 - y_k3* wk3) * plusi
219- @muladd out[k2] = (y_k0 + y_k2* wk2) - (y_k1* wk1 + y_k3* wk3)
220- @muladd out[k3] = (y_k0 - y_k2* wk2) + (y_k1* wk1 - y_k3* wk3) * minusi
221- wk1 *= w1
222- wk2 *= w2
223- wk3 *= w3
221+ kee = start_out + k * stride_out
222+ koe = start_out + (k + m) * stride_out
223+ keo = start_out + (k + 2 * m) * stride_out
224+ koo = start_out + (k + 3 * m) * stride_out
225+ y_kee, y_koe, y_keo, y_koo = out[kee], out[koe], out[keo], out[koo]
226+ ỹ_keo = y_keo* wkeo
227+ ỹ_koe = y_koe* wkoe
228+ ỹ_koo = y_koo* wkoo
229+ y_kee_p_y_keo = y_kee + ỹ_keo
230+ y_kee_m_y_keo = y_kee - ỹ_keo
231+ ỹ_koe_p_ỹ_koo = ỹ_koe + ỹ_koo
232+ ỹ_koe_m_ỹ_koo = - (ỹ_koe - ỹ_koo) * minusi
233+ out[kee] = y_kee_p_y_keo + ỹ_koe_p_ỹ_koo
234+ out[koe] = y_kee_m_y_keo + ỹ_koe_m_ỹ_koo
235+ out[keo] = y_kee_p_y_keo - ỹ_koe_p_ỹ_koo
236+ out[koo] = y_kee_m_y_keo - ỹ_koe_m_ỹ_koo
237+ wkoe *= w1
238+ wkeo *= w2
239+ wkoo *= w3
224240 end
225241end
226242
0 commit comments