-
Notifications
You must be signed in to change notification settings - Fork 51
Expand file tree
/
Copy pathfft12.cl
More file actions
81 lines (60 loc) · 2.04 KB
/
fft12.cl
File metadata and controls
81 lines (60 loc) · 2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
// Copyright (C) Mihai Preda and George Woltman
#if FFT_FP64
#if 1
#include "fft3.cl"
#include "fft4.cl"
// 24 FMA + 72 ADD
void fft12(T2 *u) {
fft3by(u, 0, 4, 12);
fft3by(u, 3, 4, 12);
fft3by(u, 6, 4, 12);
fft3by(u, 9, 4, 12);
fft4by(u, 0, 3, 12);
fft4by(u, 4, 3, 12);
fft4by(u, 8, 3, 12);
// Fix order: 0 7 2 9 4 11 6 1 8 3 10 5
SWAP(u[1], u[7]);
SWAP(u[3], u[9]);
SWAP(u[5], u[11]);
}
#else
// 24 FMA + 72 ADD
// See prime95's gwnum/zr12.mac file for more detailed explanation of the formulas below
void fft12(T2 *u) {
const double SIN1 = 0x1.bb67ae8584caap-1; // sin(tau/3), 0.86602540378443859659;
const double COS1 = 0.5; // cos(tau/3)
X2(u[0], u[6]); // (r1+ i1+), (r1- i1-)
X2_mul_t4(u[3], u[9]); // (r4+ i4+), (i4- -r4-)
X2_mul_t4(u[1], u[7]); // (r2+ i2+), (i2- -r2-)
X2_mul_t4(u[5], u[11]); // (r6+ i6+), (i6- -r6-)
X2_mul_t4(u[2], u[8]); // (r3+ i3+), (i3- -r3-)
X2_mul_t4(u[4], u[10]); // (r5+ i5+), (i5- -r5-)
X2(u[0], u[3]); // (r1++ i1++), (r1+- i1+-)
X2_mul_t4(u[1], u[5]); // (r2++ i2++), (i2+- -r2+-)
X2_mul_t4(u[2], u[4]); // (r3++ i3++), (i3+- -r3+-)
X2_mul_t4(u[7], u[11]); // (i2-+ -r2-+), (-r2-- -i2--)
X2_mul_t4(u[8], u[10]); // (i3-+ -r3-+), (-r3-- -i3--)
X2(u[2], u[1]); // (r3+++ i3+++), (r3++- i3++-)
X2(u[4], u[5]); // (i3+-+ -r3+-+), (i3+-- -r3+--)
T2 tmp26812b = fmaT2(COS1, u[7], u[9]);
T2 tmp410b = u[9] - u[7];
T2 tmp26812a = fmaT2(-COS1, u[10], u[6]);
T2 tmp410a = u[6] + u[10];
T2 tmp68a, tmp68b, tmp212a, tmp212b;
fma_addsub(tmp212b, tmp68b, SIN1, tmp26812b, u[8]);
fma_addsub(tmp68a, tmp212a, SIN1, tmp26812a, u[11]);
T2 tmp311 = fmaT2(-COS1, u[1], u[3]);
u[6] = u[3] + u[1];
T2 tmp59 = fmaT2(-COS1, u[2], u[0]);
u[0] = u[0] + u[2];
u[3] = tmp410a - tmp410b;
u[9] = tmp410a + tmp410b;
u[1] = tmp212a + tmp212b;
u[11] = tmp212a - tmp212b;
fma_addsub(u[2], u[10], SIN1, tmp311, u[4]);
fma_addsub(u[8], u[4], SIN1, tmp59, u[5]);
u[5] = tmp68a + tmp68b;
u[7] = tmp68a - tmp68b;
}
#endif
#endif