Skip to content

Commit 7107d46

Browse files
authored
Merge pull request #63 from JuliaMath/an/fasterpow4
Faster pow4 kernel
2 parents 9e73ae1 + 3d13464 commit 7107d46

File tree

1 file changed

+40
-24
lines changed

1 file changed

+40
-24
lines changed

src/algos.jl

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -187,40 +187,56 @@ Power of 4 FFT, in place
187187
188188
"""
189189
function fft_pow4!(out::AbstractVector{T}, in::AbstractVector{U}, N::Int, start_out::Int, stride_out::Int, start_in::Int, stride_in::Int, w::T) where {T, U}
190-
plusi = sign(imag(w))*im
191190
minusi = -sign(imag(w))*im
192-
if N == 4
193-
out[start_out + 0] = in[start_in] + in[start_in + stride_in] + in[start_in + 2*stride_in] + in[start_in + 3*stride_in]
194-
out[start_out + stride_out] = in[start_in] + in[start_in + stride_in]*plusi - in[start_in + 2*stride_in] + in[start_in + 3*stride_in]*minusi
195-
out[start_out + 2*stride_out] = in[start_in] - in[start_in + stride_in] + in[start_in + 2*stride_in] - in[start_in + 3*stride_in]
196-
out[start_out + 3*stride_out] = in[start_in] + in[start_in + stride_in]*minusi - in[start_in + 2*stride_in] + in[start_in + 3*stride_in]*plusi
191+
@inbounds if N == 4
192+
xee = in[start_in]
193+
xoe = in[start_in + stride_in]
194+
xeo = in[start_in + 2*stride_in]
195+
xoo = in[start_in + 3*stride_in]
196+
xee_p_xeo = xee + xeo
197+
xee_m_xeo = xee - xeo
198+
xoe_p_xoo = xoe + xoo
199+
xoe_m_xoo = -(xoe - xoo)*minusi
200+
out[start_out] = xee_p_xeo + xoe_p_xoo
201+
out[start_out + stride_out] = xee_m_xeo + xoe_m_xoo
202+
out[start_out + 2*stride_out] = xee_p_xeo - xoe_p_xoo
203+
out[start_out + 3*stride_out] = xee_m_xeo - xoe_m_xoo
197204
return
198205
end
199206
m = N ÷ 4
200207

201-
@muladd fft_pow4!(out, in, m, start_out , stride_out, start_in , stride_in*4, w^4)
202-
@muladd fft_pow4!(out, in, m, start_out + m*stride_out, stride_out, start_in + stride_in, stride_in*4, w^4)
203-
@muladd fft_pow4!(out, in, m, start_out + 2*m*stride_out, stride_out, start_in + 2*stride_in, stride_in*4, w^4)
204-
@muladd fft_pow4!(out, in, m, start_out + 3*m*stride_out, stride_out, start_in + 3*stride_in, stride_in*4, w^4)
205-
206208
w1 = w
207209
w2 = w*w1
208210
w3 = w*w2
209-
wk1 = wk2 = wk3 = one(T)
211+
w4 = w*w3
212+
213+
fft_pow4!(out, in, m, start_out , stride_out, start_in , stride_in*4, w4)
214+
fft_pow4!(out, in, m, start_out + m*stride_out, stride_out, start_in + stride_in, stride_in*4, w4)
215+
fft_pow4!(out, in, m, start_out + 2*m*stride_out, stride_out, start_in + 2*stride_in, stride_in*4, w4)
216+
fft_pow4!(out, in, m, start_out + 3*m*stride_out, stride_out, start_in + 3*stride_in, stride_in*4, w4)
217+
218+
wkoe = wkeo = wkoo = one(T)
210219

211220
@inbounds for k in 0:m-1
212-
@muladd k0 = start_out + k*stride_out
213-
@muladd k1 = start_out + (k+m)*stride_out
214-
@muladd k2 = start_out + (k+2*m)*stride_out
215-
@muladd k3 = start_out + (k+3*m)*stride_out
216-
y_k0, y_k1, y_k2, y_k3 = out[k0], out[k1], out[k2], out[k3]
217-
@muladd out[k0] = (y_k0 + y_k2*wk2) + (y_k1*wk1 + y_k3*wk3)
218-
@muladd out[k1] = (y_k0 - y_k2*wk2) + (y_k1*wk1 - y_k3*wk3) * plusi
219-
@muladd out[k2] = (y_k0 + y_k2*wk2) - (y_k1*wk1 + y_k3*wk3)
220-
@muladd out[k3] = (y_k0 - y_k2*wk2) + (y_k1*wk1 - y_k3*wk3) * minusi
221-
wk1 *= w1
222-
wk2 *= w2
223-
wk3 *= w3
221+
kee = start_out + k * stride_out
222+
koe = start_out + (k + m) * stride_out
223+
keo = start_out + (k + 2 * m) * stride_out
224+
koo = start_out + (k + 3 * m) * stride_out
225+
y_kee, y_koe, y_keo, y_koo = out[kee], out[koe], out[keo], out[koo]
226+
ỹ_keo = y_keo*wkeo
227+
ỹ_koe = y_koe*wkoe
228+
ỹ_koo = y_koo*wkoo
229+
y_kee_p_y_keo = y_kee + ỹ_keo
230+
y_kee_m_y_keo = y_kee - ỹ_keo
231+
ỹ_koe_p_ỹ_koo = ỹ_koe + ỹ_koo
232+
ỹ_koe_m_ỹ_koo = -(ỹ_koe - ỹ_koo) * minusi
233+
out[kee] = y_kee_p_y_keo + ỹ_koe_p_ỹ_koo
234+
out[koe] = y_kee_m_y_keo + ỹ_koe_m_ỹ_koo
235+
out[keo] = y_kee_p_y_keo - ỹ_koe_p_ỹ_koo
236+
out[koo] = y_kee_m_y_keo - ỹ_koe_m_ỹ_koo
237+
wkoe *= w1
238+
wkeo *= w2
239+
wkoo *= w3
224240
end
225241
end
226242

0 commit comments

Comments
 (0)